Completed
Push — typo3v9 ( 43e1a1...658720 )
by Tomas Norre
06:14
created

CrawlerController::setExtensionSettings()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
ccs 3
cts 3
cp 1
crap 1
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
35
use TYPO3\CMS\Backend\Utility\BackendUtility;
36
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
37
use TYPO3\CMS\Core\Database\Connection;
38
use TYPO3\CMS\Core\Database\ConnectionPool;
39
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
40
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
41
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
42
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
43
use TYPO3\CMS\Core\Log\Logger;
44
use TYPO3\CMS\Core\Log\LogLevel;
45
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
use TYPO3\CMS\Frontend\Utility\EidUtility;
55
use TYPO3\CMS\Lang\LanguageService;
56
57
/**
58
 * Class CrawlerController
59
 *
60
 * @package AOE\Crawler\Controller
61
 */
62
class CrawlerController
63
{
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
65
    const CLI_STATUS_REMAIN = 1; //queue not empty
66
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
67
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
68
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
69
70
    /**
71
     * @var integer
72
     */
73
    public $setID = 0;
74
75
    /**
76
     * @var string
77
     */
78
    public $processID = '';
79
80
    /**
81
     * One hour is max stalled time for the CLI
82
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
83
     *
84
     * @var integer
85
     */
86
    public $max_CLI_exec_time = 3600;
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var boolean
125
     */
126
    public $debugMode = false;
127
128
    /**
129
     * @var array
130
     */
131
    public $extensionSettings = [];
132
133
    /**
134
     * Mount Point
135
     *
136
     * @var boolean
137
     */
138
    public $MP = false;
139
140
    /**
141
     * @var string
142
     */
143
    protected $processFilename;
144
145
    /**
146
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
147
     *
148
     * @var string
149
     */
150
    protected $accessMode;
151
152
    /**
153
     * @var BackendUserAuthentication
154
     */
155
    private $backendUser;
156
157
    /**
158
     * @var integer
159
     */
160
    private $scheduledTime = 0;
161
162
    /**
163
     * @var integer
164
     */
165
    private $reqMinute = 0;
166
167
    /**
168
     * @var bool
169
     */
170
    private $submitCrawlUrls = false;
171
172
    /**
173
     * @var bool
174
     */
175
    private $downloadCrawlUrls = false;
176
177
    /**
178
     * @var QueueRepository
179
     */
180
    protected $queueRepository;
181
182
    /**
183
     * @var ProcessRepository
184
     */
185
    protected $processRepository;
186
187
    /**
188
     * @var string
189
     */
190
    protected $tableName = 'tx_crawler_queue';
191
192
    /**
193
     * @var array
194
     */
195
    private $cliArgs;
196
197
    /**
198
     * @var Logger
199
     */
200
    private $logger;
201
202
    /**
     * Returns the access mode the controller currently operates in.
     *
     * The property has no initial value, so null is returned until
     * setAccessMode() has been called.
     *
     * @return string|null The stored access mode ('gui', 'cli' or 'cli_im')
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
211
212
    /**
     * Stores the access mode the controller operates in.
     *
     * @param string $accessMode Access mode to remember ('gui', 'cli' or 'cli_im')
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
219
220
    /**
     * Switches the "disabled" marker on or off.
     *
     * The state is represented by the process file: disabling writes an empty
     * file to $this->processFilename, enabling removes that file again.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Only try to remove the marker if it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
236
237
    /**
     * Tells whether the "disabled" marker is set.
     *
     * @return bool true if the process file exists, i.e. processing is disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
246
247
    /**
     * Overrides the location of the process file that is used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
256
257
    /**
     * Returns the location of the process file that is used as "disabled" marker.
     *
     * @return string File name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
264
265
    /**
     * Returns the logger of this class, creating it on first use.
     *
     * @return Logger
     */
    private function getLogger(): Logger
    {
        if ($this->logger !== null) {
            return $this->logger;
        }

        $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
        $this->logger = $logManager->getLogger(__CLASS__);

        return $this->logger;
    }
275
276
    /************************************
277
     *
278
     * Getting URLs based on Page TSconfig
279
     *
280
     ************************************/
281
282 31
    /**
     * Sets up repositories, remembers the current backend user, determines the
     * process file location and loads the extension settings.
     *
     * Defaults applied to the settings:
     *  - countInARun falls back to 100 if it is missing or not a positive integer
     *  - processLimit is forced into the range 1..99 (default 1)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // $GLOBALS['BE_USER'] may be unset; fall back to null instead of
        // raising an undefined-index notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings(is_array($settings) ? $settings : []);

        // set defaults (the settings array may be empty, hence the ?? guards):
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
306
307
    /**
     * Replaces the extension settings held by this controller
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; no defaults are applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
317
318
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match decides:
     *  1. hidden pages (unless the setting "crawlHiddenPages" is enabled)
     *  2. doktype 3, 4 or anything from 199 upwards
     *  3. doktypes registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; "hidden" and "doktype" are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Missing keys are treated like empty values instead of raising notices.
        $doktype = $pageRow['doktype'] ?? '';

        // if page is hidden
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? null;
        if (is_array($excludedDoktypes)) {
            foreach ($excludedDoktypes as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $doktype)) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? null;
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $key => $func) {
                $params = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                // no need to execute other hooks if a previous one returned a veto
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
381
382
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); if it is to be
     * skipped, an empty array is returned and the reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason, or '' if the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // Compare on the integer value: a row value of '2' (string) would
        // never be identical to the integer 2, so SSL would never be forced.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
406
407
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page and configuration hash.
     * If there is none, callers can skip a more detailed check.
     *
     * @param  int $uid Page id
     * @param  string $configurationHash
     * @return boolean true if no matching unprocessed entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', intval($uid)),
                $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any non-zero count means there is still something left to process.
        return !$pendingEntries;
    }
438
439
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl ("URLs" and "subCfg" are read).
     * @param array $pageRow Page row ("uid" is read)
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests);
     *                           values that are not greater than zero disable the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; contains a key per url to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!isset($vv['URLs']) || !is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $urlList = '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. Guarded, because a rate of
        // zero (or an empty value) would otherwise trigger a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, rounded down to the full minute; date() expects an integer:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is overwritten (not appended to) for every URL,
            // so only the entry of the last URL is returned - confirm whether ".=" was intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $timeLabel = '[' . date('d.m.y H:i', $schTime) . '] ';
                $urlList = $timeLabel . htmlspecialchars($urlQuery);
                $this->urlList[] = $timeLabel . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
528
529
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions means "no filtering",
     * in which case true is returned as well.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true if nothing is to be filtered or at least one incoming instruction is in $piString
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit boolean instead of implicitly returning null.
        return false;
    }
548
549 2
    /**
     * Fetches the page TSconfig for a page and lets registered hooks alter it.
     *
     * If $this->MP is set, it is split at "-" and the TSconfig of the page id
     * found in the second part is used instead of the one of $id.
     *
     * Hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive the page id and a reference to the TSconfig array.
     *
     * @param integer $id Page ID
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        // (?? avoids an undefined-index notice when no hook is registered)
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
571
572
    /**
     * This method returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.)
     *  2. tx_crawler_configuration records stored on the pages of the rootline
     * A configuration name that is already present is never overwritten.
     * Finally the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] may alter the result.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https (only evaluated for tx_crawler_configuration records)
     * @return array Configurations, indexed by configuration name
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                    if (!is_array($values)) {
                        continue;
                    }

                    $key = str_replace('.', '', $key);
                    // Sub configuration for a single configuration string:
                    $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                    $subCfg['key'] = $key;

                    if (strcmp($subCfg['procInstrFilter'], '')) {
                        $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                    }
                    $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                    // skip the configuration if it is page-specific and the current page is not listed:
                    if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                        continue;
                    }

                    // add trailing slash if not present
                    if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                        $subCfg['baseUrl'] .= '/';
                    }

                    // Explode, process etc.:
                    $paramParsed = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                    $paramExpanded = $this->expandParameters($paramParsed, $id);

                    // recognize MP value
                    $urlStart = !$this->MP ? '?id=' . $id : '?id=' . $id . '&MP=' . $this->MP;

                    $res[$key] = [
                        'subCfg' => $subCfg,
                        'paramParsed' => $paramParsed,
                        'paramExpanded' => $paramExpanded,
                        'origin' => 'pagets',
                        'URLs' => $this->compileUrls($paramExpanded, [$urlStart]),
                    ];
                }
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    $queryBuilder->expr()->eq('pid', $page['uid'])
                )
                ->execute()
                ->fetchAll();

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $mayUseRecord = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$mayUseRecord) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // skip the configuration if it is page-specific and the current page is not listed:
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages listed in the record's exclude string get no configuration
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                // hooks get the result by reference and may modify it in place
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
723
724
    /**
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
     *
     * The given $baseUrl is returned unchanged if $sysDomainUid is not a positive
     * number, if no sys_domain row is found, or if the row has an empty domainName.
     *
     * @param string $baseUrl Fallback base url
     * @param integer $sysDomainUid Uid of the sys_domain record to look up
     * @param bool $ssl Anything but false makes the generated url use "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = (int)$sysDomainUid;
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Ask for a query builder of the table that is actually queried.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute()
            ->fetch();

        // fetch() yields false when there is no matching row; do not treat that as an array.
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
754
755
    /**
     * Collects the names of all crawler configurations that are available for a page branch.
     *
     * Names come from the page TSconfig of $rootid (tx_crawler.crawlerCfg.paramSets.)
     * and from tx_crawler_configuration records stored on the rootline of $rootid
     * or on pages of the page tree below $rootid (down to $depth levels).
     *
     * @param $rootid Page id the branch starts at
     * @param $depth Depth passed to the page tree
     * @return array List of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // 1) names of the paramSets defined in page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (is_array($value)) {
                    // strip the trailing dot of the TSconfig key
                    $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
                }
            }
        }

        // 2) page ids of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        // ... and of the pages below $rootid the backend user has permission for
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeEntry) {
            $pids[] = $treeEntry['row']['uid'];
        }

        // 3) names of the configuration records stored on those pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
814
815
    /**
     * Creates a query builder on the connection that is responsible for the given table.
     *
     * @param string $table Table name used to look up the connection
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
827
828
    /**
     * Tells whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or at least one of the user's group UIDs is part of it
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // Nothing to check against: the item is not restricted.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // The first matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
849
850
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * Names and values are raw-URL-decoded. Parts with an empty name are dropped;
     * a part without "=" results in an empty string value. If a name occurs more
     * than once, the last occurrence wins.
     *
     * @param string $inputQuery Input query string, e.g. "id=1&L=2"
     * @return array Parameter names as keys, decoded values as values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements: a part without "=" (e.g. "&flag&") would otherwise leave
            // the value undefined and raise an "undefined offset" notice.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
872
873
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE
     *         - Default: Literal value
     *
     * After each configuration part the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called; they receive $paramArray by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Without enclosing square brackets the value is literal and becomes the only value:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value as an array.
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $low = $reg[1];
                    $high = $reg[2];

                    // Swap if first is larger than last:
                    if ($low > $high) {
                        $swap = $high;
                        $high = $low;
                        $low = $swap;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $low; $a <= $high; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse "key:value" sub parameters; a part without ":" gets a NULL value
                    // instead of raising an "undefined offset" notice.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $tableName = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are inserted into the query as raw SQL;
                        // this presumes the configuration comes from a trusted source - confirm.
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // Field names are handed over unquoted: the expression builder is
                            // documented to quote them itself, quoting here would quote twice.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            // Only touch the FROM part when an additional table is configured,
                            // otherwise an empty string would be written into it.
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the VALUE of the selected field, so that the keys are
                            // the (unique) values to add. Indexing by the field name kept a single
                            // entry and added the field name itself as parameter value.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
1013
1014
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless the "maxCompileUrls" extension setting is exceeded: compiling the current parameter
     * then stops right after the URL that exceeded the limit.
     *
     * A value that equals an empty string adds nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // The limit does not change while traversing, so resolve it once instead of per URL.
        $maxCompileUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? null, 1, 1000000000, 10000);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string)$val;
                $newUrls[] = $url . ($val !== '' ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode($val) : '');

                // Leave BOTH loops: leaving only the inner one would still add one more URL
                // for every remaining base URL and so defeat the limit.
                if (count($newUrls) > $maxCompileUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1047
1048
    /************************************
1049
     *
1050
     * Crawler log
1051
     *
1052
     ************************************/
1053
1054
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is not limited to the page and filter ("1=1")
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time as a standalone AND-expression,
        // because flushQueue() takes a WHERE clause and not a query builder.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', intval($id)));
            // flushQueue() documents a string, so hand over the rendered expression.
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1108
1109
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, flushQueue() is called without any constraint
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time as a standalone AND-expression,
        // because flushQueue() takes a WHERE clause and not a query builder.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', intval($set_id)));
            // flushQueue() documents a string, so hand over the rendered expression.
            $this->flushQueue($doFullFlush ? '' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1162
1163
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once
     * per affected set_id (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed, an empty value removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $where = (string)$where;
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        // Every statement gets its own query builder: from() appends to the FROM part, so
        // re-using one builder would list the table once more for each further SELECT.
        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // GROUP BY gives the distinct set ids; "DISTINCT set_id" handed to select()
            // would be treated as one single field name.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on the queue table in this class select "qid";
                    // verify that a "uid" column exists before relying on this branch.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1203
1204
    /**
     * Adds a call back entry to the crawler queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function, anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate, 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time is given.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1234
1235
    /************************************
1236
     *
1237
     * URL setting
1238
     *
1239
     ************************************/
1240
1241
    /**
     * Registers a URL for crawling.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected
     * in $this->queueEntries or inserted into tx_crawler_queue. Before inserting, the queue is
     * checked for an equal entry (unless that check is skipped); an "urlAddedToQueue" or a
     * "duplicateUrlInQueue" event is posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupUids = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $uGs = implode(',', $feUserGroupUids);
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry
        $duplicates = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        if (count($duplicates) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert('tx_crawler_queue', $fieldArray);
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1323
1324
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past (or now), it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are neither executed (exec_time) nor assigned to a process (process_id)
     * and that match the page_id and parameters_hash of $fieldArray are considered.
     *
     * @param int $tstamp Scheduled time of the entry to be added
     * @param array $fieldArray Queue record to be added, "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicates, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // An entry with a timestamp in the future needs to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter((int)$tstamp, \PDO::PARAM_INT))
                );
        }

        // A duplicate has to be UNPROCESSED, i.e. not executed and not picked up by a process.
        // The former "!= 0" comparisons selected the processed entries instead.
        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1385
1386
    /**
     * Provides the current system time as returned by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1395
1396
    /************************************
1397
     *
1398
     * URL reading
1399
     *
1400
     ************************************/
1401
1402
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by writing exec_time, executes it through
     * readUrl_exec() and finally stores the serialized result in result_data.
     * Signals are emitted before the lock is written and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null 0, or self::CLI_STATUS_POLLABLE_PROCESSED when a registered pollable
     *                      reported success; null when no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without $force only entries that are still unprocessed and already scheduled for a process qualify.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        // unserialize() yields false for broken data, so verify the type before indexing into it.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->getLogger()->log(
                LogLevel::WARNING,
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() can return the string 'ERROR' or false instead of an array;
        // only an array result carries a 'content' entry that can be decoded.
        // NOTE(review): unserialize() is applied to content produced by the crawled request - confirm this source is trusted.
        $resultData = false;
        if (is_array($result) && isset($result['content'])) {
            $resultData = unserialize($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        $procInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : null;

        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1513
1514
    /**
     * Inserts a not-yet-stored queue entry and crawls it immediately.
     *
     * exec_time is set before the insert, so the new record is stored as already
     * locked. After readUrl_exec() has run, the serialized result is written to
     * the record's result_data field.
     *
     * @param array $field_array Queue field array
     *
     * @return string|array|bool The value returned by readUrl_exec() for this entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The freshly generated qid is needed both for the request and for the later update.
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1552
1553
    /**
     * Read URL for a queue record
     *
     * The record's serialized "parameters" decide what happens:
     * - with a "_CALLBACKOBJ" entry, that class is instantiated and its
     *   crawler_execute() method is called with the remaining parameters;
     * - otherwise the "url" entry is requested through requestUrl() and an
     *   "urlCrawled" event is posted.
     *
     * @param array $queueRec Queue record
     * @return string|array|bool 'ERROR' when the parameters cannot be decoded to an array,
     *                           otherwise the callback result array or the requestUrl() result
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        // (!empty() instead of a bare index read: regular requests carry no "_CALLBACKOBJ" key at all.)
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:

        // Prepare: "<qid>:<md5 of qid|set_id|encryptionKey>", verified again in fe_init().
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1588
1589
    /**
     * Gets the content of a URL.
     *
     * Unless direct requests are enabled in the extension settings, the URL is
     * fetched with a hand-built HTTP/1.0 GET over fsockopen(), optionally through
     * the proxy configured in $GLOBALS['TYPO3_CONF_VARS']['SYS'], and redirects
     * are followed when the "follow30x" setting is on.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with the keys "request", "headers" and "content" (plus "parentRequest"
     *                       when a redirect was followed), the result of sendDirectRequest() for direct
     *                       requests, or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() leaves out components that are absent from the URL, so provide
        // neutral defaults for everything read below. A missing path becomes "/" so
        // that the request line stays valid.
        $url += [
            'scheme' => '',
            'host' => '',
            'port' => 0,
            'user' => '',
            'pass' => '',
            'path' => '/',
            'query' => '',
        ];

        if (!in_array($url['scheme'], ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // The socket is opened to the proxy and the request line carries the absolute target URL.
            $proxy = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $rurl = (is_array($proxy) ? $proxy : []) + ['host' => '', 'port' => 0];
            $url['path'] = $url['scheme'] . '://' . $url['host'] . ($url['port'] > 0 ? ':' . $url['port'] : '') . $url['path'];
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($url['scheme'] === 'https') {
            $host = 'ssl://' . $host;
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 443;
        } else {
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'], $url['pass']);

            // getRequestUrlFrom302Header() returns false when there is nothing to follow.
            if (is_string($newUrl) && $newUrl !== '') {
                // Follow the redirect exactly once, handing the timeout through and
                // spending one level of the recursion budget.
                $redirectResult = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($redirectResult)) {
                    $this->getLogger()->log(
                        LogLevel::DEBUG,
                        sprintf('Error while opening "%s"', $newUrl),
                        ['crawlerId' => $crawlerId]
                    );
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $redirectResult);
            }
        }

        return $result;
    }
1699
1700
    /**
     * Determines the base path of the website frontend, always in the
     * form "/" or "/<pathSegments>/".
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension
     * setting, the absRefPrefix of $GLOBALS['TSFE'], and - outside of CLI
     * mode - the TYPO3_SITE_PATH environment value. "/" is the fallback.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings.
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix.
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!(defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI)) {
            // Outside of CLI mode the path can be taken from the $_SERVER environment.
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Enforce exactly one leading and one trailing slash.
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1730
1731
    /**
     * Runs a command through shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1741
1742
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first blank line; everything
     * after that is collected unchanged as content. Anything that is not a
     * resource yields two empty lists.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // fgets() signals the end of the stream with false; comparing against false
        // explicitly keeps a falsy line such as "0" from ending the loop early.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line: end of the header section.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1773
1774
    /**
     * Appends a timestamped line to the log file named in the extension settings.
     *
     * Nothing happens when no "logFileName" is configured. When the file cannot
     * be written, an INFO entry is handed to the logger instead.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Warnings from the write are suppressed; a failed write is reported through the logger below.
        $bytesWritten = @file_put_contents($logFile, $entry, FILE_APPEND);
        if ($bytesWritten) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $logFile)
        );
    }
1789
1790
    /**
     * Builds HTTP request headers.
     *
     * Produces the lines of an HTTP/1.0 GET request: request line, Host, an
     * optional backend cookie when the query contains "ADMCMD_previewWS",
     * "Connection: close", optional Basic authorization, the X-T3crawler
     * header and the user agent.
     *
     * @param array $url URL components in the shape returned by parse_url(); absent components are tolerated
     * @param string $crawlerId Value for the X-T3crawler header
     *
     * @return array List of header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits components that are not part of the URL, so none of
        // these keys can be taken for granted. An empty path becomes "/" to keep
        // the request line valid.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1814
1815
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are treated as redirects. When a user name is given, the
     * credentials are embedded into the redirect target
     * (scheme://user:pass@host[:port]path[?query]).
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or false when there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = false;
        foreach (['301 Moved', '302 Found', '302 Moved'] as $marker) {
            if (stripos($statusLine, $marker) !== false) {
                $isRedirect = true;
                break;
            }
        }
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Splitting into at most two parts keeps
        // any further colons inside the value; header names are matched
        // case-insensitively. Lines without a colon (the status line) are skipped.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        // parse_url() omits absent components, hence the isset() fallbacks.
        $newUrl = (isset($target['scheme']) ? $target['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($target['host']) ? $target['host'] : '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . (isset($target['path']) ? $target['path'] : '');
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1857
1858
    /**************************
1859
     *
1860
     * tslib_fe hooks:
1861
     *
1862
     **************************/
1863
1864
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected as "<qid>:<hash>", where the hash is the md5 of
     * "qid|set_id|encryptionKey" - the same value readUrl_exec() builds. Requests
     * without the header pass through untouched; a header that does not match a
     * queue record terminates the request.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5); not used in this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Pad to two items so that a header without ":" yields an empty hash
        // instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $isAuthentic = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // The hash arrives in a request header, so compare it in constant time.
            $isAuthentic = hash_equals($expectedHash, (string)$hash);
        }

        // If a crawler record was found and hash was matching, set it up:
        if ($isAuthentic) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1901
1902
    /*****************************
1903
     *
1904
     * Compiling URLs to crawl - tools
1905
     *
1906
     *****************************/
1907
1908
    /**
     * Walks the page tree below a root page and renders the URL rows of every page found.
     *
     * The request settings are stored on the instance first, because
     * drawURLs_addRowsForPage() reads them from there. Pages with doktype 7 are
     * handled as mount points: the mounted tree is rendered as well, with
     * $this->MP set to "<mount_pid>-<uid>" while it is processed.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Make sure a language service is available in the global scope.
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }

        // Request settings, read later while the rows are drawn.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start every run with empty collectors.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permsClause);

        // Set root row:
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPage)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $html = '';

        foreach ($pageTree->tree as $node) {
            $this->MP = false;

            // recognize mount points
            if ($node['row']['doktype'] == 7) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountRows = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($node['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->getRestrictions()->reset();

                // fetch mounted pages
                $this->MP = $mountRows[0]['mount_pid'] . '-' . $node['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountRows[0]['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountNode) {
                    $mountTitle = $mountNode['HTML'] . BackendUtility::getRecordTitle('pages', $mountNode['row'], true);
                    $html .= $this->drawURLs_addRowsForPage($mountNode['row'], $mountTitle);
                }

                if (!$mountRows[0]['mount_pid_ol']) {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                } else {
                    // replace page when mount_pid_ol is enabled
                    $node['row']['uid'] = $mountRows[0]['mount_pid'];
                }
            }

            $nodeTitle = $node['HTML'] . BackendUtility::getRecordTitle('pages', $node['row'], true);
            $html .= $this->drawURLs_addRowsForPage($node['row'], $nodeTitle);
        }

        return $html;
    }
2015
2016
    /**
     * Expands exclude string
     *
     * The string is a comma separated list whose parts have the form "<pid>"
     * (that page only), "<pid>+<depth>" (the page plus its subtree down to the
     * given depth) or "<pid>+" (depth 99). Non-empty results are cached per
     * exclude string in a static variable, and so are the fetched subtrees.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids covered by the exclude string, without duplicates
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (!empty($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // A plain "<pid>" part has no "+<depth>" suffix; pad the list so both
                // variables are always assigned.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
2067
2068
    /**
     * Create the rows for display of the page tree.
     *
     * For each page one table row per crawler configuration is rendered, showing the parsed
     * and expanded GET parameters, the generated URL list and the configuration options.
     * A page without any (selected) configuration gets a single "No entries" row.
     *
     * @param array $pageRow Page row; "uid" is read here, the full row is handed on to the URL helpers
     * @param string $pageTitleAndIcon Page icon and title for the row (HTML, inserted unescaped)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference - confirm in getUrlsForPageRow()
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configurations that do not match the current selection
            // (loose comparison kept on purpose: array keys may be integers)
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (count($configurations) === 0) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Labels of the plain-text options shown in the "options" column
        $optionLabels = [
            'userGroups' => 'User Groups',
            'baseUrl' => 'Base Url',
            'procInstrFilter' => 'ProcInstr',
        ];

        // Traverse parameter combinations:
        $configurationCount = count($configurations);
        $content = '';
        $c = 0;
        foreach ($configurations as $confKey => $confArray) {
            // Optional sub configuration keys are read through $subCfg with defaults to avoid notices
            $subCfg = $confArray['subCfg'] ?? [];
            $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');

            // Title column: only the first row carries it, spanning all rows of this page
            $titleClm = $c === 0
                ? '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>'
                : '';

            if (in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $c++;
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options: plain configuration strings, so they are escaped before being put into HTML
            $optionValues = '';
            foreach ($optionLabels as $optionKey => $optionLabel) {
                if (!empty($subCfg[$optionKey])) {
                    $optionValues .= $optionLabel . ': ' . htmlspecialchars((string)$subCfg[$optionKey]) . '<br/>';
                }
            }

            // Compile row:
            $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';

            $c++;
        }

        return $content;
    }
2185
2186
    /*****************************
2187
     *
2188
     * CLI functions
2189
     *
2190
     *****************************/
2191
2192
    /**
     * Helper function: returns one value of a parsed CLI option.
     *
     * Reads from $this->cliArgs, the property that setCliArgs() fills.
     *
     * @param string $option Option string, eg. "-s"
     * @param int $idx Value index, default is 0 (zero) = the first one...
     * @return string The value, or an empty string if the option or the index is not set
     */
    private function cli_argValue($option, $idx = 0)
    {
        $values = $this->cliArgs[$option] ?? null;

        return is_array($values) ? ($values[$idx] ?? '') : '';
    }
2203
2204
    /**
     * Helper function: prints a line to stdout.
     *
     * Writes directly with echo (as CLI_debug() does) instead of delegating to
     * outputLine(), which static analysis reports as not defined on this controller.
     *
     * @param string $string The string to output
     */
    private function cli_echo($string)
    {
        echo $string . PHP_EOL;
    }
2213
2214
    /**
     * Set cli args
     *
     * This is a copy from the CommandLineController from TYPO3 < v9
     *
     * Tokens starting with "-" (and not followed by a digit) open a new option; an optional
     * "=value" part becomes its first value. All other tokens are appended as values to the
     * most recently opened option, or to "_DEFAULT" if none was opened yet.
     * The result is stored in $this->cliArgs.
     *
     * TODO: Rework
     *
     * @param array $argv Raw argument tokens
     */
    private function setCliArgs(array $argv)
    {
        $cli_options = [];
        $index = '_DEFAULT';
        foreach ($argv as $token) {
            // Options starting with a number are invalid - they could be negative values!
            // The "?? ''" guards keep empty and one-character tokens from raising offset notices.
            $isOption = ($token[0] ?? '') === '-'
                && !MathUtility::canBeInterpretedAsInteger($token[1] ?? '');

            if (!$isOption) {
                $cli_options[$index][] = $token;
                continue;
            }

            // array_pad() guarantees a second element, so "-x" without "=value" yields null for $opt
            list($index, $opt) = array_pad(explode('=', $token, 2), 2, null);
            if (isset($cli_options[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cli_options[$index] = [];
            if ($opt !== null) {
                $cli_options[$index][] = $opt;
            }
        }

        $this->cliArgs = $cli_options;
    }
2246
2247
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * The first value of the "-conf" option is split at commas.
     *
     * @return array Array of keys, empty if no keys were given
     */
    protected function getConfigurationKeys()
    {
        // The value index is passed explicitly; cast guards trim() against a non-string value
        $parameter = trim((string)$this->cli_argValue('-conf', 0));

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2257
2258
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, purge old executed queue entries (if "purgeQueueDays" is set),
     * select due queue entries, reserve them for this process and read their URLs one by one.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Pause after each URL, passed to usleep()
     * @param int $sleepAfterFinish Pause after the run, passed to sleep()
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        // Every statement gets its own QueryBuilder so that no query parts leak between statements
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = (int)($this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays);

            try {
                $connectionPool->getQueryBuilderForTable($this->tableName)
                    ->delete($this->tableName)
                    ->where(
                        'exec_time != 0 AND exec_time < ' . $purgeDate
                    )
                    ->execute();
            } catch (\Exception $e) {
                // Purging is best effort: a failing clean-up must not stop the crawl
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $rows = $selectQuery
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQuery->expr()->eq('exec_time', 0),
                $selectQuery->expr()->eq('process_scheduled', 0),
                $selectQuery->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            // qids are inserted literally into the IN() list, so force them to integers
            $quidList = array_map('intval', array_column($rows, 'qid'));

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            // (the surrounding BEGIN / COMMIT / ROLLBACK statements are currently disabled)
            //TODO make sure we're not taking assigned queue-entries

            // save the number of assigned queue entries to determine how many have been processed later
            $updateQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $updateQuery
                ->update('tx_crawler_queue')
                ->where(
                    $updateQuery->expr()->in('qid', $quidList)
                )
                // set() binds the value as a named parameter by itself
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is a hash string (see CLI_buildProcessId()), so it must not be cast to int
            $connectionPool->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => (int)$numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2382
2383
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller. Does nothing if no hooks are registered.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // "??" avoids an undefined index notice when the hook registry does not exist
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? null;
        if (!is_array($hookClassNames)) {
            return;
        }

        foreach ($hookClassNames as $objRef) {
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2400
2401
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past count as orphans and are released at the
     * end; the remaining active processes are counted against the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was added, false if the limit is reached
     *                 or no system process id could be determined
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // (a transaction "BEGIN" used to be issued here; it is currently disabled)
        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        // Split the active processes into still running ones and expired (orphaned) ones
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < intval($this->extensionSettings['processLimit']);
        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
2466
2467
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements, each on its own QueryBuilder so that no "set" or
     * "where" parts leak from one statement into the next:
     *  1. detach queue entries from processes that are not active
     *  2. reset system_process_id of inactive processes that still have pending queue entries
     *  3. mark the requested processes as non-active (only if they have no pending queue entries)
     *  4. detach the pending queue entries of the requested processes
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect, the BEGIN / COMMIT statements are disabled
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. free all queue entries that are assigned to processes which are not active
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // 2. FIXME: Not entirely sure that this is equivalent to the previous version.
        // The former TYPO3_DB based statement set deleted = 1 and system_process_id = 0 on
        // processes with active = 0 AND deleted = 0 for which NO queue entry with
        // exec_time = 0 existed.
        $resetQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $resetQuery
            ->update('tx_crawler_process')
            ->where(
                $resetQuery->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. mark all requested processes as non-active
        // (a fresh builder makes sure only "active" is written here)
        $deactivateQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $deactivateQuery
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateQuery->expr()->in('process_id', $deactivateQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivateQuery->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // 4. release the not yet executed queue entries of the requested processes
        $detachQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $detachQuery
            ->update('tx_crawler_queue')
            ->where(
                $detachQuery->expr()->eq('exec_time', 0),
                $detachQuery->expr()->in('process_id', $detachQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2561
2562
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The first matching tx_crawler_process row (ordered by ttl) decides the result.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Process ids are hash strings (see CLI_buildProcessId()), so the id is bound as a
        // string parameter rather than being converted to an integer.
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter((string)$pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        return is_array($row) && intval($row['active']) === 1;
    }
2591
2592
    /**
     * Create a unique Id for the current process
     *
     * The id is built once from the current microtime and then kept in $this->processID,
     * so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2604
2605
    /**
     * Thin wrapper around PHP's microtime() function.
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2614
2615
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is the "processDebug" extension setting.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2627
2628
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * The request is executed by running the crawler's cli/bootstrap.php through the
     * configured PHP binary; the duration of the call is logged together with the URL.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Keys "request", "headers" (always empty) and "content";
     *                an empty array if the URL cannot be parsed
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Command line: <php binary> <bootstrap script> <frontend base path> <url> <encoded headers>
        $phpBinary = escapeshellcmd($this->extensionSettings['phpPath']);
        $arguments = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($headers)),
        ];
        $command = $phpBinary . ' ' . implode(' ', array_map('escapeshellarg', $arguments));

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2666
2667
    /**
     * Cleans up entries that stayed for too long in the queue.
     *
     * The age limits (in days) are taken from the extension settings:
     * - "cleanUpProcessedAge" for entries that have already been executed
     * - "cleanUpScheduledAge" for entries measured by their scheduled time
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2683
2684
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates the controller, publishes it as $GLOBALS['TSFE'] and runs its initialisation
     * steps; a TimeTracker is registered in $GLOBALS['TT'] first if none exists yet.
     *
     * @param int $id Page id handed to the frontend controller
     * @param int $typeNum Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        // $frontend and $GLOBALS['TSFE'] refer to the same object from here on
        $frontend = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        $GLOBALS['TSFE'] = $frontend;

        $frontend->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontend->sys_page->init(true);
        $frontend->initFEuser();
        $frontend->determineId();
        $frontend->initTemplate();
        $frontend->rootLine = $frontend->sys_page->getRootLine($id, '');
        $frontend->getConfigArray();
    }
2709
2710
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = array_flip(['paramExpanded', 'URLs']);
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2723
2724
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration decides: -1 gives false, 1 gives true, 0 hands the
     * decision to the page configuration; every other value gives false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration Returned as-is when the crawler configuration is 0
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons on purpose, checked in the order -1, 0, 1
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2745
}
2746