Completed
Push — remove-realurl ( 36a4b4 )
by Tomas Norre
08:59
created

CrawlerController::urlListFromUrlArray()   C

Complexity

Conditions 10
Paths 2

Size

Total Lines 73

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 29
CRAP Score 10.5035

Importance

Changes 0
Metric Value
cc 10
nc 2
nop 9
dl 0
loc 73
ccs 29
cts 35
cp 0.8286
crap 10.5035
rs 6.7224
c 0
b 0
f 0

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Introduce Parameter Object, and Preserve Whole Object.

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Domain\Repository\ProcessRepository;
29
use AOE\Crawler\Domain\Repository\QueueRepository;
30
use AOE\Crawler\Event\EventDispatcher;
31
use AOE\Crawler\Utility\IconUtility;
32
use AOE\Crawler\Utility\SignalSlotUtility;
33
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
34
use TYPO3\CMS\Backend\Utility\BackendUtility;
35
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
36
use TYPO3\CMS\Core\Database\Connection;
37
use TYPO3\CMS\Core\Database\ConnectionPool;
38
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
39
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
40
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
41
use TYPO3\CMS\Core\Log\Logger;
42
use TYPO3\CMS\Core\Log\LogLevel;
43
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
44
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
45
use TYPO3\CMS\Core\Utility\DebugUtility;
46
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
47
use TYPO3\CMS\Core\Utility\GeneralUtility;
48
use TYPO3\CMS\Core\Utility\MathUtility;
49
use TYPO3\CMS\Extbase\Object\ObjectManager;
50
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
51
use TYPO3\CMS\Frontend\Page\PageRepository;
52
use TYPO3\CMS\Frontend\Utility\EidUtility;
53
use TYPO3\CMS\Lang\LanguageService;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController
61
{
62
    // NOTE(review): the values 0, 1, 2, 4, 8 look like combinable bit flags - confirm against callers.
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
63
    const CLI_STATUS_REMAIN = 1; // queue not empty
64
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
65
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
66
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
67
68
    /**
69
     * @var integer
70
     */
71
    public $setID = 0;
72
73
    /**
74
     * @var string
75
     */
76
    public $processID = '';
77
78
    /**
79
     * One hour is max stalled time for the CLI
80
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
81
     *
82
     * @var integer
83
     */
84
    public $max_CLI_exec_time = 3600;
85
86
    /**
87
     * @var array
88
     */
89
    public $duplicateTrack = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $downloadUrls = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingProcInstructions = [];
100
101
    /**
102
     * @var array
103
     */
104
    public $incomingConfigurationSelection = [];
105
106
    /**
107
     * @var bool
108
     */
109
    public $registerQueueEntriesInternallyOnly = false;
110
111
    /**
112
     * @var array
113
     */
114
    public $queueEntries = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $urlList = [];
120
121
    /**
122
     * @var boolean
123
     */
124
    public $debugMode = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $extensionSettings = [];
130
131
    /**
132
     * Mount Point
133
     *
     * Initialised to false. When set to a non-false value, other methods of this
     * class treat it as a string: it is split on "-" to find the page whose
     * TSconfig is loaded and it is appended to generated URLs as the "MP" parameter.
     *
134
     * @var bool|string
135
     */
136
    public $MP = false;
137
138
    /**
139
     * @var string
140
     */
141
    protected $processFilename;
142
143
    /**
144
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
145
     *
146
     * @var string
147
     */
148
    protected $accessMode;
149
150
    /**
151
     * @var BackendUserAuthentication
152
     */
153
    private $backendUser;
154
155
    /**
156
     * @var integer
157
     */
158
    private $scheduledTime = 0;
159
160
    /**
161
     * @var integer
162
     */
163
    private $reqMinute = 0;
164
165
    /**
166
     * @var bool
167
     */
168
    private $submitCrawlUrls = false;
169
170
    /**
171
     * @var bool
172
     */
173
    private $downloadCrawlUrls = false;
174
175
    /**
176
     * @var QueueRepository
177
     */
178
    protected $queueRepository;
179
180
    /**
181
     * @var ProcessRepository
182
     */
183
    protected $processRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var array
192
     */
193
    private $cliArgs;
194
195
    /**
196
     * @var Logger
197
     */
198
    private $logger;
199
200
    /**
     * Returns the access mode this controller is currently running in.
     *
     * @return string One of 'gui', 'cli' or 'cli_im'
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
209
210
    /**
     * Stores the access mode this controller is running in.
     *
     * @param string $accessMode Access mode, e.g. 'gui', 'cli' or 'cli_im'
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
217
218
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file named by $this->processFilename: disabling writes
     * an empty file there, enabling removes the file if it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it is actually present
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
234
235
    /**
     * Tells whether the "disabled" marker file currently exists.
     *
     * @return bool true if the file named by $this->processFilename exists (disabled)
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
244
245
    /**
     * Overrides the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
254
255
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
262
263
    /**
     * Returns the logger for this class, creating it on first use.
     *
     * @return Logger
     */
    private function getLogger(): Logger
    {
        if ($this->logger !== null) {
            return $this->logger;
        }

        $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
        $this->logger = $logManager->getLogger(__CLASS__);

        return $this->logger;
    }
273
274
    /************************************
275
     *
276
     * Getting URLs based on Page TSconfig
277
     *
278
     ************************************/
279
280 34
    /**
     * Wires up the repositories, remembers the backend user and loads the
     * extension settings from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'].
     *
     * Missing or invalid settings fall back to an empty array; "countInARun"
     * then defaults to 100 and "processLimit" is forced into the range 1..99.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // BE_USER may be unset, so do not raise a notice for it
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The settings are a serialized array of scalars; never instantiate objects from it
        $serializedSettings = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'] ?? '';
        $settings = (is_string($serializedSettings) && $serializedSettings !== '')
            ? unserialize($serializedSettings, ['allowed_classes' => false])
            : [];
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
    }
302
303
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is the unserialized form of $TYPO3_CONF_VARS['EXT']['extConf']['crawler'].
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
313
314
    /**
     * Decides whether a page must be left out of crawling.
     *
     * Checks are applied in this order and the first hit wins:
     *  1. hidden pages (unless the "crawlHiddenPages" setting is enabled)
     *  2. doktype 3, 4 or anything from 199 upwards
     *  3. doktype lists registered in EXTCONF/crawler/excludeDoktype
     *  4. veto hooks registered in EXTCONF/crawler/pageVeto
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $registeredBy => $excludedDoktypes) {
                if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $registeredBy . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookKey => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto ends the check, later hooks are not executed
                return is_string($veto)
                    ? $veto
                    : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
            }
        }

        return false;
    }
377
378
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and $skipMessage carries the reason.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set to the skip reason, or to '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // Database rows commonly deliver integer columns as strings, so normalise
        // before comparing; a strict comparison against int 2 would never match "2".
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
402
403
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page and configuration hash.
     * If there is none, callers can skip a more detailed check.
     *
     * @param  int $uid
     * @param  string $configurationHash
     * @return boolean true when no matching unprocessed entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $expr = $queryBuilder->expr();
        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $expr->eq('page_id', intval($uid)),
                $expr->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $expr->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
434
435
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL of $vv['URLs'] contributes one HTML fragment to the returned
     * string. URLs whose uniqueness key is already present in $duplicateTrack
     * are only displayed (dimmed) and are neither submitted nor downloaded.
     *
     * @param    array $vv Information about URLs from pageRow to crawl.
     * @param    array $pageRow Page row
     * @param    integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param    integer $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param    boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param    boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param    array $duplicateTrack Passed by reference; holds one key per URL to make sure duplicates are not crawled
     * @param    array $downloadUrls Passed by reference; filled with URLs for download if flag is set.
     * @param    array $incomingProcInstructions Array of processing instructions
     * @return    string        List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!isset($vv['URLs']) || !is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $urlList = '';
        $subCfg = $vv['subCfg'];

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // The filter only depends on the configuration, not on the single URL: evaluate it once
        if (!$this->drawURLs_PIfilter($subCfg['procInstrFilter'], $incomingProcInstructions)) {
            return $urlList;
        }

        // Seconds between two scheduled requests; guards against a division by zero
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            // Calculate cHash:
            if ($subCfg['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $subCfg['userGroups'] . '|' . $subCfg['baseUrl'] . '|' . $subCfg['procInstrFilter'];
            $urlQuery = 'index.php' . $urlQuery;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);
                $timePrefix = '[' . date('d.m.y H:i', $schTime) . '] ';

                // Append (instead of overwrite) so that the result really lists every URL
                $urlList .= $timePrefix . htmlspecialchars($urlQuery);
                $this->urlList[] = $timePrefix . $urlQuery;

                $theUrl = ($subCfg['baseUrl'] ? $subCfg['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
524
525
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions means "no restriction" and always
     * yields true. Otherwise at least one of the incoming instructions has to
     * be contained in the comma separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Return an explicit boolean instead of falling through with null
        return false;
    }
544
545 2
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the
     * first "-" of that value instead of for $id. The hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * always receive the original $id together with a reference to the TSconfig.
     *
     * @param integer $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            list(, $tsConfigPageId) = explode('-', $this->MP);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            return $pageTSconfig;
        }

        // Call a hook to alter configuration
        $hookParameters = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $hookFunction) {
            GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
        }

        return $pageTSconfig;
    }
567
568
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.), origin "pagets"
     *  2. tx_crawler_configuration records found on the pages of the rootline,
     *     origin "tx_crawler_configuration_<uid>"; a record never overwrites a
     *     key that has already been set
     * Afterwards every function registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] gets the
     * result passed by reference.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */

        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                    if (!is_array($values)) {
                        continue;
                    }

                    $key = str_replace('.', '', $key);
                    // Sub configuration for a single configuration string:
                    $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                    $subCfg['key'] = $key;

                    if (strcmp($subCfg['procInstrFilter'], '')) {
                        $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                    }
                    $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                    // process configuration if it is not page-specific or if the specific page is the current page:
                    if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                        continue;
                    }

                    // add trailing slash if not present
                    if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                        $subCfg['baseUrl'] .= '/';
                    }

                    // Explode, process etc.:
                    $res[$key] = [];
                    $res[$key]['subCfg'] = $subCfg;
                    $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                    $res[$key]['origin'] = 'pagets';

                    // recognize MP value
                    if (!$this->MP) {
                        $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                    } else {
                        $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
                    }
                }
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        // Deleted records are excluded by this restriction, so the deprecated
        // BackendUtility::deleteClause() is not needed in the where clause below.
        $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        // BEenableFields() returns its conditions prefixed with " AND"; strip that prefix.
        // The value does not depend on the page, so it is computed once.
        $enableFields = trim((string)substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4));

        foreach ($rootLine as $page) {
            $constraints = [
                $queryBuilder->expr()->eq('pid', (int)$page['uid'])
            ];
            if ($enableFields !== '') {
                $constraints[] = $enableFields;
            }

            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(...$constraints)
                ->execute()
                ->fetchAll();

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$hasAccess) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // NOTE(review): unlike the page TSconfig branch above, no "&MP=" is added here - confirm this is intended
                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    $res[$key] = [];
                    $res[$key]['subCfg'] = $subCfg;
                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
                }
            }
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
717
718
    /**
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
     *
     * The sys_domain record is only looked up for a positive $sysDomainUid.
     * When it has a non-empty "domainName", the result is
     * "<scheme>://<domainName>" with scheme "http" for $ssl === false and
     * "https" for any other value.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid Uid of the sys_domain record
     * @param bool $ssl
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Ask for the query builder of the table that is actually queried
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute()
            ->fetch();

        // fetch() yields false when no record matches
        if (is_array($row) && (string)($row['domainName'] ?? '') !== '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
748
749
    /**
     * Collects the names of the crawler configurations available for a page branch.
     *
     * Two sources are combined, in this order:
     * - the keys of the page TSconfig "tx_crawler.crawlerCfg.paramSets." of $rootid
     *   (only array-valued entries, with the trailing dot removed)
     * - the "name" of every tx_crawler_configuration record whose pid is one of the
     *   pages returned by BackendUtility::BEgetRootLine($rootid) or by
     *   PageTreeView::getTree($rootid, $depth)
     *
     * @param int $rootid
     * @param int $depth
     * @return array
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        // Null coalescing keeps this silent when a page has no crawler TSconfig at all.
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        // Only the deleted flag and the start/end time are honoured; all other default restrictions are dropped.
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
808
809
    /**
     * Creates a query builder on the connection that serves the given table.
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
821
822
    /**
     * Tells whether a user, identified by a group list, may access an item
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
843
844
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * The string is split on "&" and each part on the first "=". Names and values
     * are raw-URL-decoded. Parts with an empty name are skipped; a part without
     * "=" yields an empty string as value. On duplicate names the last one wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad to two elements so that a bare "name" (no "=") does not hit an undefined offset.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
866
867
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE, _ADDTABLE
     *         - Default: Literal value
     * - After every part the "expandParameters" hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] are called
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each holding the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        // Swap if first is larger than last:
                        if ($reg[1] > $reg[2]) {
                            $temp = $reg[2];
                            $reg[2] = $reg[1];
                            $reg[1] = $temp;
                        }

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                        // Parse parameters ("KEY:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Pad with NULL so a key without ":value" stays "not set" for the isset() checks below.
                            list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        $table = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($TCA[$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                            $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                // The expression builder quotes field names itself; quoting them
                                // beforehand would quote twice and produce an unknown column.
                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                        $where
                                    );

                                if ($addTable !== '') {
                                    // Only touch the FROM part when _ADDTABLE is configured; an
                                    // unconditional add() would overwrite the table set above.
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                $transOrigPointerField = $TCA[$table]['ctrl']['transOrigPointerField'] ?? '';

                                if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Key by the selected field's value so that array_keys() returns
                                // the (de-duplicated) record values, not the column name.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid
                        ];
                        foreach ($expandHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
1007
1008
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per recursion level by the "maxCompileUrls" extension setting.
     *
     * Each level takes the first parameter off $paramArray, appends "&name=value"
     * (raw-URL-encoded) for each of its values to every URL collected so far, and
     * recurses with the remaining parameters. Empty values add nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Loop invariant, so resolve the limit once instead of per generated URL.
        $maxCompileUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                if (count($newUrls) > $maxCompileUrls) {
                    // Leave both loops: breaking only the inner one would still add
                    // one more URL for every remaining base URL.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1041
1042
    /************************************
1043
     *
1044
     * Crawler log
1045
     *
1046
     ************************************/
1047
1048
    /**
     * Return array of records from crawler queue for input page ID
     *
     * Without $doFlush the matching queue rows are returned, newest "scheduled"
     * first. With $doFlush the rows are deleted instead and an empty array is returned.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, then the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE the whole queue is flushed, regardless of page and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-composite that accumulates the conditions for the flush; add() extends it in place.
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1102
1103
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching queue rows are returned, newest "scheduled"
     * first. With $doFlush the rows are deleted instead and an empty array is returned.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, then the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE the whole queue is flushed, regardless of set and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-composite that accumulates the conditions for the flush; add() extends it in place.
        $query = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty condition makes flushQueue() fall back to "1=1", i.e. flush everything.
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1156
1157
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the entries
     * about to be removed are first read and posted to the event dispatcher,
     * grouped by set_id. Afterwards all entries matching the condition are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; callers in this class
     *                      also pass an expression object that converts to such a string. Empty means "all entries".
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own builder so no SELECT state leaks into the next one.
            $groupQueryBuilder = $this->getQueryBuilder($this->tableName);
            $groups = $groupQueryBuilder
                // select() would quote "DISTINCT set_id" as one identifier, hence the literal.
                ->selectLiteral('DISTINCT ' . $groupQueryBuilder->quoteIdentifier('set_id'))
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1197
1198
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the
     * serialized parameters of the new tx_crawler_queue row.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => intval($schedule) ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1228
1229
    /************************************
1230
     *
1231
     * URL setting
1232
     *
1233
     ************************************/
1234
1235
    /**
     * Puts a URL into the crawler queue.
     *
     * The queue record is built from the URL and the sub configuration. If
     * $this->registerQueueEntriesInternallyOnly is set, the record is only
     * collected in $this->queueEntries. Otherwise it is inserted into
     * tx_crawler_queue unless the duplication check finds an equal entry; an
     * "urlAddedToQueue" or "duplicateUrlInQueue" event is posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the key order is part of the serialized value and its hash):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $userGroupUids = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $userGroupList = implode(',', $userGroupUids);
        if ($userGroupList) {
            $parameters['feUserGroupList'] = $userGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // check if there is already an equal entry
        $duplicateRows = $skipInnerDuplicationCheck
            ? []
            : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);

        if (count($duplicateRows) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1317
1318
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries for the same page_id and parameters_hash that have not been
     * executed yet count as duplicates.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue record about to be inserted; "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        // if this entry is scheduled with "now"
        if ($tstamp <= $currentTime) {
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        $statement = $queryBuilder
            // A duplicate is an entry that is still waiting: exec_time = 0 marks "not run yet"
            // (the "pending" filter of the log methods uses the same test). Matching
            // exec_time != 0 would only ever find entries that were already processed.
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            // NOTE(review): assumes process_id is a string column that is empty while no
            // process has claimed the entry - confirm against the table schema.
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1379
1380
    /**
     * Provides the current system time.
     *
     * @return int Unix timestamp as reported by time()
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1389
1390
    /************************************
1391
     *
1392
     * URL reading
1393
     *
1394
     ************************************/
1395
1396
    /**
     * Reads the URL for a single queue entry.
     *
     * Loads the queue record, marks it as executed (exec_time), runs it through readUrl_exec(),
     * evaluates the configured "pollSuccess" instructions and stores the serialized result in the
     * "result_data" field of the queue record.
     *
     * @param integer $queueId Queue entry id ("qid")
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask of CLI status flags (0 if nothing was flagged); null if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without $force only entries that are not executed yet and are scheduled for a process qualify.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return;
        }

        // unserialize() yields false for broken data, so the type is checked before reading a key.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->getLogger()->log(
                LogLevel::WARNING,
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return the string 'ERROR' or false instead of a result array,
        // so "content" is only read from an actual array.
        $resultData = false;
        if (is_array($result) && isset($result['content'])) {
            $resultData = unserialize($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        $procInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1507
1508
    /**
     * Reads the URL for a queue entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set, executed through readUrl_exec() and finally
     * updated with the serialized result in "result_data".
     *
     * @param array $field_array Queue field array
     *
     * @return mixed Whatever readUrl_exec() returned for the inserted entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time right away so the new record is locked from the start:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Storing the result in the log also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1546
1547
    /**
     * Reads the URL for a queue record.
     *
     * If the unserialized parameters name a "_CALLBACKOBJ", that object's crawler_execute() is called;
     * otherwise the "url" parameter is requested through requestUrl().
     *
     * @param array $queueRec Queue record; "parameters", "qid" and "set_id" are read
     * @return array|string|boolean Result array, the string "ERROR" if the parameters could not be
     *                              decoded, or whatever requestUrl() returned (false on failure)
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        // NOTE(review): unserialize() is applied to data stored in the queue table - confirm that
        // only trusted code can write that column.
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object (the key is absent for regular frontend requests, hence the !empty() check):
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request.
        // Prepare the crawler id (qid plus a hash over qid, set_id and the encryption key):
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1582
1583
    /**
     * Gets the content of a URL.
     *
     * Depending on the "makeDirectRequests" extension setting the request is either handed to
     * sendDirectRequest() or sent as a plain HTTP/1.0 request over a socket (optionally through the
     * configured proxy server). With the "follow30x" setting, redirects are followed recursively and
     * the redirecting response is kept under the "parentRequest" key of the result.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time (passed to fsockopen())
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with the keys "request", "headers" and "content" (plus "parentRequest"
     *                       after a redirect) for socket requests, the result of sendDirectRequest()
     *                       for direct requests, or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // Log the original string; $url is false here and carries no information.
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits the key for URLs without a scheme.
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and put the absolute target URL into the request line.
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://'
                . (isset($url['host']) ? $url['host'] : '')
                . (!empty($url['port']) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = (is_array($rurl) && isset($rurl['host'])) ? $rurl['host'] : '';
        $explicitPort = (is_array($rurl) && !empty($rurl['port'])) ? $rurl['port'] : 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $explicitPort > 0 ? $explicitPort : 443;
        } else {
            $port = $explicitPort > 0 ? $explicitPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header(
                $d['headers'],
                isset($url['user']) ? $url['user'] : '',
                isset($url['pass']) ? $url['pass'] : ''
            );

            if ($newUrl) {
                // Follow the redirect exactly once per level, with the recursion limiter decreased.
                // (A leftover second call used to request the target twice and passed the recursion
                // counter as timeout.)
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($newRequestUrl)) {
                    $this->getLogger()->log(
                        LogLevel::DEBUG,
                        sprintf('Error while opening "%s"', $originalUrl),
                        ['crawlerId' => $crawlerId]
                    );
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1693
1694
    /**
     * Determines the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * $GLOBALS['TSFE']->absRefPrefix and, when not running in CLI mode, the TYPO3_SITE_PATH
     * environment value. Without any of them "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        $isCli = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings.
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix.
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCli) {
            // Outside of CLI mode the base path can be determined from the $_SERVER environment.
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1724
1725
    /**
     * Runs a shell command via shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1735
1736
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything after that is
     * collected unchanged as content. The stream is read in chunks of at most 2047 bytes per line.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. The explicit comparison with false is needed because fgets() can return
        // falsy strings such as "0", which must not end the loop.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // An empty line separates the headers from the content.
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content:
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1767
1768
    /**
     * Appends a timestamped message to the file configured in the "logFileName" extension setting.
     *
     * Nothing happens if no log file is configured. If the file could not be written, an INFO entry
     * is sent to the logger instead.
     *
     * @param string $message Text to append to the log file
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Warnings of file_put_contents() are suppressed; a failure is reported through the logger below.
        $fileResult = @file_put_contents($logFile, $entry, FILE_APPEND);
        if ($fileResult) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $logFile)
        );
    }
1783
1784
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and headers of a HTTP/1.0 GET request. A preview cookie is added if
     * the query string contains "ADMCMD_previewWS", and a Basic Authorization header is added if the
     * URL carries a user name.
     *
     * @param array $url URL parts as returned by parse_url(); "path", "query", "host", "user" and
     *                   "pass" are read, all of them optional
     * @param string $crawlerId Value for the "X-T3crawler" header
     *
     * @return array List of header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() leaves out the keys of absent URL parts, so every part gets a default.
        // An empty path would produce a malformed request line, hence "/" is used then.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1808
1809
    /**
     * Checks if the submitted HTTP headers contain a redirect location and builds the new crawler URL.
     *
     * A redirect is recognized if the first header line contains "301 Moved", "302 Found" or
     * "302 Moved" and a "Location" header is present. If a user name is given, the credentials are
     * inserted into the redirect URL.
     *
     * @param array $headers HTTP header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if the headers do not describe a usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        $location = null;
        foreach ($headers as $headerLine) {
            // Split at the first colon only, so that a value containing ": " stays intact.
            // Lines without a colon (like the status line) are skipped.
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) < 2) {
                continue;
            }
            // Header field names are compared case-insensitively.
            if (strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Re-insert the credentials into the redirect target.
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = (isset($target['scheme']) ? $target['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($target['host']) ? $target['host'] : '')
            . (!empty($target['port']) ? ':' . $target['port'] : '')
            . (isset($target['path']) ? $target['path'] : '');
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1851
1852
    /**************************
1853
     *
1854
     * tslib_fe hooks:
1855
     *
1856
     **************************/
1857
1858
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * If the header is present but no queue record with a matching hash exists, the request is
     * terminated with die().
     *
     * @param array $params Parameters from frontend; $params['pObj']->applicationData is written
     * @param object $ref TSFE object (reference under PHP5), not used here
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only requests carrying the crawler header are handled:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // The header has the form "<qid>:<hash>"; pad so a header without ":" yields an empty hash.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // If a crawler record was found and hash was matching, set it up.
        // hash_equals() compares the request-supplied hash in constant time.
        $isAuthentic = is_array($queueRec) && hash_equals(
            md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']),
            (string)$hash
        );

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1895
1896
    /*****************************
1897
     *
1898
     * Compiling URLs to crawl - tools
1899
     *
1900
     *****************************/
1901
1902
    /**
     * Collects the rows for a page and the page tree beneath it.
     *
     * Stores the given options on the controller, builds the page tree below $id (respecting the
     * page permissions of the backend user) and passes every page to drawURLs_addRowsForPage().
     * For mount point pages (doktype 7) the mounted tree is processed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                // The restrictions are reset on the query builder itself.
                $queryBuilder->resetRestrictions();

                // Only continue with the mount handling if the page record could be loaded.
                $mountRecord = (is_array($mountpage) && isset($mountpage[0])) ? $mountpage[0] : null;
                if (is_array($mountRecord)) {
                    // fetch mounted pages
                    $this->MP = $mountRecord['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRecord['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRecord['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRecord['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
2010
2011
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of entries of the form "<pid>" or "<pid>+<depth>".
     * A plain "<pid>" excludes only that page, "<pid>+" (no depth given) excludes the page and its
     * subtree down to depth 99, and "<pid>+<depth>" limits the subtree to the given depth.
     * Results are cached per exclude string in a static variable for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids to exclude
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: entries without "+" carry no depth part.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // Subtrees are cached per page id and depth.
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
2062
2063
    /**
2064
     * Create the rows for display of the page tree
2065
     * For each page a number of rows are shown displaying GET variable configuration
2066
     *
2067
     * @param    array        Page row
2068
     * @param    string        Page icon and title for row
2069
     * @return    string        HTML <tr> content (one or more)
2070
     */
2071
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Collect the crawler configurations that apply to this page.
        // NOTE(review): $skipMessage is presumably filled by reference with the reason a page is skipped - confirm in getUrlsForPageRow().
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // If specific configurations were selected, drop every configuration that is not part of the selection.
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach ($configurations as $selectionKey => $unusedConfiguration) {
                if (!in_array($selectionKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$selectionKey]);
                }
            }
        }

        $configurationCount = count($configurations);

        // Nothing left to display: render a single placeholder row, with the skip reason if there is one.
        if (!$configurationCount) {
            $message = empty($skipMessage) ? '' : ' (' . $skipMessage . ')';

            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // One table row per configuration.
        $rowIndex = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // Alternating row colour.
            $rowClass = 'bgColor' . ($rowIndex % 2 ? '-20' : '-10');

            // Only the first row carries the title cell; it spans all rows of this page.
            $titleClm = $rowIndex
                ? ''
                : '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>';

            // Pages listed in the configuration's exclude string only get a short notice.
            $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude']);
            if (in_array($pageRow['uid'], $excludedPageIds)) {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $rowIndex++;
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters: one inner table row per GET variable, plus the number of combinations.
            $paramRows = '';
            $valueCounts = [];
            $combinations = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $combinations *= $valueCount;
                $valueCounts[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramRows . '</table>'
                . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

            // Options: list each sub-configuration option that has a non-empty value.
            $optionLabels = [
                'userGroups' => 'User Groups: ',
                'baseUrl' => 'Base Url: ',
                'procInstrFilter' => 'ProcInstr: ',
            ];
            $optionValues = '';
            foreach ($optionLabels as $optionKey => $optionLabel) {
                if ($confArray['subCfg'][$optionKey]) {
                    $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                }
            }

            // Parsed parameters, shown with one "&name=value" pair per line.
            $parsedQuery = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
            $parsedParameters = nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedQuery)))));

            // Compile row:
            $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . $parsedParameters . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';

            $rowIndex++;
        }

        return $content;
    }
2180
2181
    /*****************************
2182
     *
2183
     * CLI functions
2184
     *
2185
     *****************************/
2186
2187
    /**
2188
     * Helper function
2189
     *
2190
     * @param string $option Option string, e.g. "-s"
2191
     * @param int $idx Value index, default is 0 (zero) = the first one...
2192
     * @return string
2193
     */
2194
    private function cli_argValue($option, $idx = 0)
    {
        // setCliArgs() stores the parsed options in $this->cliArgs, so that is the
        // property to read here; $idx defaults to the first value of the option.
        if (!isset($this->cliArgs[$option]) || !is_array($this->cliArgs[$option])) {
            return '';
        }

        // Unknown value index: return an empty string instead of reading an undefined offset.
        return isset($this->cliArgs[$option][$idx]) ? $this->cliArgs[$option][$idx] : '';
    }
2198
2199
    /**
2200
     * Helper function
2201
     *
2202
     * @param string $string The string to output
2203
     */
2204
    private function cli_echo($string)
    {
        // This controller has no outputLine() method, so calling it would be a fatal
        // error. Write the line to stdout directly, the same way CLI_debug() does.
        echo $string . "\n";
        flush();
    }
2208
2209
    /**
2210
     * Set cli args
2211
     *
2212
     * This is a copy from the CommandLineController from TYPO3 < v9
2213
     *
2214
     * TODO: Rework
2215
     *
2216
     * @param array $argv
2217
     */
2218
    private function setCliArgs(array $argv)
    {
        // Parsed result: option name => list of values. Tokens seen before the first
        // option are collected under "_DEFAULT".
        $cliOptions = [];
        $index = '_DEFAULT';

        foreach ($argv as $token) {
            // Guard the offset reads so empty or one-character tokens do not raise notices.
            $firstChar = isset($token[0]) ? $token[0] : '';
            $secondChar = isset($token[1]) ? $token[1] : '';

            // Options starting with a number are invalid - they could be negative values!
            $isOption = $firstChar === '-' && !MathUtility::canBeInterpretedAsInteger($secondChar);
            if (!$isOption) {
                // Plain value: belongs to the most recently seen option.
                $cliOptions[$index][] = $token;
                continue;
            }

            // "-opt=value" carries its first value inline; "-opt" alone has no inline value.
            $parts = explode('=', $token, 2);
            $index = $parts[0];
            if (isset($cliOptions[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cliOptions[$index] = isset($parts[1]) ? [$parts[1]] : [];
        }

        $this->cliArgs = $cliOptions;
    }
2241
2242
    /**
2243
     * Obtains configuration keys from the CLI arguments
2244
     *
2245
     * @return array Array of keys, or an empty array if no keys were given
2246
     */
2247
    protected function getConfigurationKeys()
    {
        // cli_argValue() takes the value index as its second argument; pass 0 explicitly
        // to read the first value given for "-conf".
        $parameter = trim((string)$this->cli_argValue('-conf', 0));

        // Comma-separated list of configuration keys; an empty option yields an empty list.
        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2252
2253
    /**
2254
     * Running the functionality of the CLI (crawling URLs from queue)
2255
     *
2256
     * @param int $countInARun
2257
     * @param int $sleepTime
2258
     * @param int $sleepAfterFinish
2259
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
2260
     */
2261
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        // $result is a bitmask: readUrl() results OR-ed with self::CLI_STATUS_* flags.
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue: remove processed entries older than the configured number of days.
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // Dedicated builder so the DELETE parts cannot leak into the queries below.
            $purgeQueryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable($this->tableName);
            // The statement has to be executed; building it alone deletes nothing.
            $del = $purgeQueryBuilder
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . (int)$purgeDate
                )
                ->execute();
            if (false === $del) {
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries: due, unprocessed and not yet reserved by another process.
        //TODO Shouldn't this reside within the transaction?
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->eq('process_scheduled', 0),
                $queryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            $quidList = array_map('intval', array_column($rows, 'qid'));
            $processId = $this->CLI_buildProcessId();

            // Reserve the selected queue entries for this process.
            // NOTE: the reservation is not wrapped in a transaction.
            //TODO make sure we're not taking assigned queue-entries
            $reserveQueryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable($this->tableName);

            // Save the number of assigned queue entries to determine how many have been processed later.
            // set() turns its value into a named parameter itself, so raw values are passed here.
            $numberOfAffectedRows = $reserveQueryBuilder
                ->update('tx_crawler_queue')
                ->where(
                    $reserveQueryBuilder->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is a hash string (see CLI_buildProcessId()), so it must not be cast to int.
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            // Fewer rows updated than selected: the reservation did not cover the whole selection, so abort this run.
            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $processId . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // If the CLI has been disabled since the start we need to return from the function
                // and mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($processId)) {
                    $this->CLI_debug("conflict / timeout (" . $processId . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $processId . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2377
2378
    /**
2379
     * Activate hooks
2380
     *
2381
     * @return void
2382
     */
2383
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        // Class names of the hooks registered for the crawler CLI.
        $registeredHooks = $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'];
        if (!is_array($registeredHooks)) {
            return;
        }

        // Instantiate each hook and let it initialise itself with this controller.
        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2395
2396
    /**
2397
     * Try to acquire a new process with the given id
2398
     * also performs some auto-cleanup for orphan processes
2399
     * @todo preemption might not be the most elegant way to clean up
2400
     *
2401
     * @param string $id identification string for the process
2402
     * @return boolean
2403
     */
2404
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Without a usable system process id no process can be registered.
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // Load all active, non-deleted process records.
        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split them into still-running processes and orphans whose ttl has expired.
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // Register a new process only while the configured limit has not been reached.
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . $processLimit . ")");
        }

        // Clean up regardless of the outcome: release orphans and purge records flagged as deleted.
        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        return $acquired;
    }
2462
2463
    /**
2464
     * Release a process and the required resources
2465
     *
2466
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
2467
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
2468
     * @return boolean
2469
     */
2470
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        // NOTE: $withinLock currently has no effect; it only guarded transaction
        // statements (BEGIN/COMMIT) that are disabled in this implementation.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Accept a single process id as well as a list of ids.
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (count($releaseIds) <= 0) {
            return false;   //nothing to release
        }

        // Some kind of 2nd chance algorithm - this way you need at least 2 processes to have a real cleanup.
        // This ensures that a single process can't mess up the entire process table.

        // Step 1: detach queue entries that are still assigned to a process which is no longer active.
        $assignedToInactiveProcess = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $queryBuilder
        ->update('tx_crawler_queue', 'q')
        ->where($assignedToInactiveProcess)
        ->set('q.process_scheduled', 0)
        ->set('q.process_id', '')
        ->execute();

        // Step 2: reset the system process id of inactive processes that are referenced by unprocessed queue entries.
        // FIXME: Not entirely sure that this is equivalent to the previous version, which flagged
        // inactive, non-deleted processes WITHOUT unprocessed queue entries as deleted (deleted = 1)
        // and reset their system_process_id in the same statement.
        $queryBuilder->resetQueryPart('set');

        $referencedByPendingQueueEntry = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                $referencedByPendingQueueEntry
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: mark all requested processes as non-active, provided they have no unprocessed queue entries left.
        // NOTE(review): the "set" part is not reset before this statement, so it presumably
        // still carries the system_process_id assignment from step 2 - confirm this is intended.
        $withoutPendingQueueEntries = 'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )';
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $withoutPendingQueueEntries,
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // Step 4: hand the unprocessed queue entries of the requested processes back to the queue.
        $queryBuilder->resetQueryPart('set');
        $queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2557
2558
    /**
2559
     * Delete processes marked as deleted
2560
     *
2561
     * @return void
2562
     *
2563
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
2564
     * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted()
2565
     */
2566 1
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Physically remove every process record that carries the deleted flag.
        $deleteQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $deleteQuery
            ->delete('tx_crawler_process')
            ->where('deleted = 1')
            ->execute();
    }
2574
2575
    /**
2576
     * Check if there are still resources left for the process with the given id
2577
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
2578
     *
2579
     * @param  string  identification string for the process
2580
     * @return boolean determines if the process is still active / has resources
2581
     *
2582
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
2583
     */
2584 1
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Process ids are hash strings (see CLI_buildProcessId()), so the id is bound as a
        // string parameter; an integer cast would turn most ids into 0 or a truncated number.
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter($pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        // No matching record means the process is gone and therefore not active.
        return is_array($row) && intval($row['active']) === 1;
    }
2604
2605
    /**
2606
     * Create a unique Id for the current process
2607
     *
2608
     * @return string  the ID
2609
     */
2610 2
    public function CLI_buildProcessId()
    {
        // Reuse the id once it has been generated for this controller instance.
        if ($this->processID) {
            return $this->processID;
        }

        // Derive a short hash from the current time in microseconds.
        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2617
2618
    /**
2619
     * @param bool $get_as_float
2620
     *
2621
     * @return mixed
2622
     */
2623
    protected function microtime($get_as_float = false)
    {
        // Thin wrapper around PHP's microtime(), kept as a method so it can be overridden.
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2627
2628
    /**
2629
     * Prints a message to the stdout (only if debug-mode is enabled)
2630
     *
2631
     * @param  string $msg  the message
2632
     */
2633
    public function CLI_debug($msg)
    {
        // Stay silent unless the "processDebug" extension setting is switched on.
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2640
2641
    /**
2642
     * Get URL content by making direct request to TYPO3.
2643
     *
2644
     * @param  string $url          Page URL
2645
     * @param  int    $crawlerId    Crawler-ID
2646
     * @return array
2647
     */
2648 2
    protected function sendDirectRequest($url, $crawlerId)
    {
        // A URL that cannot be parsed yields an empty result.
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Command line: <php binary> <bootstrap script> <frontend base path> <url> <encoded headers>
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $command = implode(' ', $commandParts);

        // Run the request and log how long it took.
        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2679
2680
    /**
2681
     * Cleans up entries that stayed for too long in the queue. These are:
2682
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
2683
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
2684
     *
2685
     * @return void
2686
     */
2687
    public function cleanUpOldQueueEntries()
    {
        // Both age limits are configured in days; convert them to seconds.
        $secondsPerDay = 86400; // 24*60*60 seconds in 24 hours
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        // Flush entries executed before the first limit, or scheduled at or before the second one.
        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2696
2697
    /**
2698
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
2699
     *
2700
     * @param int $id
2701
     * @param int $typeNum
2702
     *
2703
     * @return void
2704
     */
2705
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        // Make sure a started time tracker is registered globally.
        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        // Create the frontend controller and register it globally before initialising it.
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        $GLOBALS['TSFE'] = $frontendController;

        // The initialisation steps below run in the same fixed order as before.
        $frontendController->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontendController->sys_page->init(true);
        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->initTemplate();
        // sys_page is read from the controller again here on purpose rather than from a cached local.
        $frontendController->rootLine = $frontendController->sys_page->getRootLine($id, '');
        $frontendController->getConfigArray();
    }
2722
2723
    /**
2724
     * Returns a md5 hash generated from a serialized configuration array.
2725
     *
2726
     * @param array $configuration
2727
     *
2728
     * @return string
2729
     */
2730 7
    protected function getConfigurationHash(array $configuration)
    {
        // The expanded parameters and the URL list are left out of the hash.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2736
2737
    /**
2738
     * Check whether the Crawling Protocol should be http or https
2739
     *
2740
     * @param $crawlerConfiguration
2741
     * @param $pageConfiguration
2742
     *
2743
     * @return bool
2744
     */
2745 6
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons on purpose: they mirror switch/case matching, checked in the same order.

        // -1: never use https.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        // 0: return the page's own setting as given.
        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        // 1: always use https; any other value falls back to false.
        return $crawlerConfiguration == 1;
    }
2758
}
2759