Completed
Push — typo3v9 ( a731bf...b7e311 )
by Tomas Norre
05:36
created

CrawlerController::getLogger()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 0
dl 0
loc 7
ccs 0
cts 4
cp 0
crap 6
rs 10
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use Psr\Log\LoggerAwareInterface;
35
use Psr\Log\LoggerAwareTrait;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Database\Connection;
40
use TYPO3\CMS\Core\Database\ConnectionPool;
41
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
42
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
43
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
45
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
use TYPO3\CMS\Frontend\Utility\EidUtility;
55
use TYPO3\CMS\Lang\LanguageService;
56
57
/**
58
 * Class CrawlerController
59
 *
60
 * @package AOE\Crawler\Controller
61
 */
62
class CrawlerController implements LoggerAwareInterface
63
{
64
    use LoggerAwareTrait;
65
66
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
67
    const CLI_STATUS_REMAIN = 1; //queue not empty
68
    const CLI_STATUS_PROCESSED = 2; //(some) queue items were processed
69
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
70
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
71
72
    /**
73
     * @var integer
74
     */
75
    public $setID = 0;
76
77
    /**
78
     * @var string
79
     */
80
    public $processID = '';
81
82
    /**
83
     * One hour is max stalled time for the CLI
84
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
85
     *
86
     * @var integer
87
     */
88
    public $max_CLI_exec_time = 3600;
89
90
    /**
91
     * @var array
92
     */
93
    public $duplicateTrack = [];
94
95
    /**
96
     * @var array
97
     */
98
    public $downloadUrls = [];
99
100
    /**
101
     * @var array
102
     */
103
    public $incomingProcInstructions = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $incomingConfigurationSelection = [];
109
110
    /**
111
     * @var bool
112
     */
113
    public $registerQueueEntriesInternallyOnly = false;
114
115
    /**
116
     * @var array
117
     */
118
    public $queueEntries = [];
119
120
    /**
121
     * @var array
122
     */
123
    public $urlList = [];
124
125
    /**
126
     * @var array
127
     */
128
    public $extensionSettings = [];
129
130
    /**
131
     * Mount Point
132
     *
133
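     * @var bool|string false when no mount point is set, otherwise the mount point string (it is split on '-' and appended to URLs as "&MP=")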
134
     */
135
    public $MP = false;
136
137
    /**
138
     * @var string
139
     */
140
    protected $processFilename;
141
142
    /**
143
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
144
     *
145
     * @var string
146
     */
147
    protected $accessMode;
148
149
    /**
150
     * @var BackendUserAuthentication
151
     */
152
    private $backendUser;
153
154
    /**
155
     * @var integer
156
     */
157
    private $scheduledTime = 0;
158
159
    /**
160
     * @var integer
161
     */
162
    private $reqMinute = 0;
163
164
    /**
165
     * @var bool
166
     */
167
    private $submitCrawlUrls = false;
168
169
    /**
170
     * @var bool
171
     */
172
    private $downloadCrawlUrls = false;
173
174
    /**
175
     * @var QueueRepository
176
     */
177
    protected $queueRepository;
178
179
    /**
180
     * @var ProcessRepository
181
     */
182
    protected $processRepository;
183
184
    /**
185
     * @var string
186
     */
187
    protected $tableName = 'tx_crawler_queue';
188
189
    /**
190
     * @var array
191
     */
192
    private $cliArgs;
193
194
    /**
     * Returns the access mode previously stored via setAccessMode().
     *
     * @return string the current access mode ('gui', 'cli' or 'cli_im' according to the property documentation)
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
203
204
    /**
     * Stores the access mode the crawler is operated in.
     *
     * @param string $accessMode access mode to remember (the property documentation lists 'gui', 'cli' and 'cli_im')
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
211
212
    /**
     * Switches the "disabled" marker on or off to prevent processes from being processed.
     *
     * Disabling writes an empty file to the configured process filename;
     * enabling removes that file again if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // enabling: only remove the marker when it actually exists
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
228
229
    /**
     * Tells whether processing is currently disabled.
     *
     * The state is derived from the existence of the process marker file.
     *
     * @return bool true if disabled (the marker file exists)
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
238
239
    /**
     * Overrides the location of the process marker file.
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
248
249
    /**
     * Returns the location of the process marker file.
     *
     * @return string file name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
256
257
    /************************************
258
     *
259
     * Getting URLs based on Page TSconfig
260
     *
261
     ************************************/
262
263 31
    /**
     * Sets up the repositories, the backend user, the default process marker
     * file and the extension settings including their fallback values.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $extensionConfiguration = $configurationProvider->getExtensionConfiguration();
        if (!is_array($extensionConfiguration)) {
            $extensionConfiguration = [];
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($extensionConfiguration);

        // set defaults: countInARun falls back to 100 when it converts to 0
        $countInARun = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($countInARun == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is passed through forceIntegerInRange() with the bounds 1 and 99 and the default 1
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
287
288
    /**
     * Replaces the extension settings
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as is; no defaults are applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
298
299
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and the first match wins:
     *  1. hidden pages, unless the extension setting 'crawlHiddenPages' is enabled
     *  2. doktypes 3 and 4 and every doktype >= 199
     *  3. doktype lists registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * Optional keys (settings, page row columns, hook registrations) are read
     * defensively so that a missing key does not raise a notice or warning.
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled (not excluded), otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? null;
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? null;
        if (is_array($excludedDoktypes)) {
            foreach ($excludedDoktypes as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $doktype)) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? null;
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $key => $func) {
                // rebuilt for every hook because callUserFunction() receives it as a variable the hook may alter
                $params = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto !== false) {
                    // no need to execute other hooks if a previous one return a veto
                    return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
                }
            }
        }

        return false;
    }
362
363
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when that
     * yields a message, the message is handed back through $skipMessage and
     * an empty array is returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage set to the skip reason, or to '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // Compare the int-cast value: a row that carries url_scheme as the string '2'
        // would never satisfy a strict comparison against the integer 2.
        // NOTE(review): 2 is presumably the HTTPS value of pages.url_scheme - confirm.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
387
388
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page_id and configuration hash.
     * If there is none, callers can skip an inner detail check.
     *
     * @param  int $uid
     * @param  string $configurationHash
     * @return boolean true when no matching unprocessed entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', intval($uid)),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
419
420
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL of the configuration is appended to the returned list. URLs
     * whose uniqueness key is already present in $duplicateTrack are only
     * displayed (dimmed) and neither submitted nor offered for download.
     *
     * @param    array   $vv Information about URLs from pageRow to crawl.
     * @param    array   $pageRow Page row
     * @param    integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param    integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param    boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param    boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param    array   $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param    array   $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param    array   $incomingProcInstructions Array of processing instructions
     * @return   string  List of URLs (meant for display in backend module)
     *
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!isset($vv['URLs']) || !is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $urlList = '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two requests; a non-positive rate means "no interleave"
        // instead of a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            if (isset($duplicateTrack[$uKey])) {
                //if the url key is registered just display it and do not resubmit is
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $formattedTime = date('d.m.y H:i', $schTime);
                // append (not overwrite) so that the result really lists every URL
                $urlList .= '[' . $formattedTime . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . $formattedTime . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
509
510
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     * Otherwise at least one incoming instruction has to be contained in the
     * comma separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // explicit boolean instead of falling off the end (which yields null)
        return false;
    }
529
530 2
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig of the page id found
     * after the '-' in the mount point string is loaded instead of the
     * TSconfig of $id.
     *
     * @param integer $id Page ID
     * @return array|mixed whatever BackendUtility::getPagesTSconfig() returned, possibly altered by hooks
     */
    public function getPageTSconfigForId($id)
    {
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $mountPointId = $mountPointParts[1];
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            // the TSconfig is passed by reference so that every hook works on the result of the previous one
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
552
553
    /**
     * Collects the crawler configurations that apply to a page.
     * The result holds configurations and no urls ready for display!
     *
     * Each entry is keyed by the configuration key and carries 'subCfg',
     * 'paramParsed', 'paramExpanded', 'URLs' and 'origin'. Page TSconfig
     * paramSets are evaluated first; tx_crawler_configuration records found
     * along the rootline are added afterwards and never replace a key that is
     * already present. Finally the registered 'processUrls' hooks may alter
     * the result.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
            $paramSets = is_array($crawlerCfg['paramSets.']) ? $crawlerCfg['paramSets.'] : [];

            foreach ($paramSets as $rawKey => $values) {
                if (!is_array($values)) {
                    continue;
                }
                $key = str_replace('.', '', $rawKey);

                // Sub configuration for a single configuration string:
                $subCfg = (array)$paramSets[$key . '.'];
                $subCfg['key'] = $key;

                if (strcmp($subCfg['procInstrFilter'], '')) {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                // skip the configuration if it is page-specific and the current page is not listed:
                if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams($paramSets[$key]);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $urlBase = '?id=' . $id;
                if ($this->MP) {
                    $urlBase .= '&MP=' . $this->MP;
                }

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$urlBase]),
                ];
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $recordsOnPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    $queryBuilder->expr()->eq('pid', $page['uid'])
                )
                ->execute()
                ->fetchAll();

            if (!is_array($recordsOnPage)) {
                continue;
            }

            foreach ($recordsOnPage as $configurationRecord) {
                // check access to the configuration record
                $accessGranted = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$accessGranted) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // skip the configuration if it is page-specific and the current page is not listed:
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages named in the exclude string get no entry for this configuration
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        // let registered hooks post-process the collected configurations (passed by reference)
        $processUrlHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'];
        if (is_array($processUrlHooks)) {
            foreach ($processUrlHooks as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
704
705
    /**
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
     *
     * The sys_domain record is only looked up for a positive uid. When it is
     * found and carries a non-empty domainName, the result is
     * "<scheme>://<domainName>" (without trailing slash), where the scheme is
     * 'http' only if $ssl is strictly false and 'https' otherwise.
     *
     * @param string $baseUrl fallback that is returned unchanged when no domain record applies
     * @param integer $sysDomainUid uid of the sys_domain record
     * @param bool $ssl
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // The query reads sys_domain, so the builder is requested for that table
        // (and not for the queue table) to end up on the connection serving it.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute()
            ->fetch();

        // fetch() yields false when no record matches; only index into real rows
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
735
736
    /**
     * Lists the names of the crawler configurations available for a page branch.
     *
     * The names come from two sources: the paramSets of the page TSconfig of
     * $rootid, and the tx_crawler_configuration records stored on the pages of
     * the rootline of $rootid or on the pages of the tree below it.
     *
     * @param $rootid
     * @param $depth
     * @return array
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // 1) configuration keys defined in page TSconfig (a trailing dot is stripped)
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (is_array($value)) {
                    $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
                }
            }
        }

        // 2) page ids of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        // ... and of the tree below, limited by the page permission clause of the backend user
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeNode) {
            $pids[] = $treeNode['row']['uid'];
        }

        // 3) names of the configuration records stored on any of these pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $restrictions = $queryBuilder->getRestrictions()->removeAll();
        $restrictions
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
795
796
    /**
     * Creates a query builder on the connection that is responsible for the given table.
     *
     * @param string $table Table name used to look up the connection
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        /** @var ConnectionPool $connectionPool */
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
808
809
    /**
     * Tells whether a user, identified by its group list, may access an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access list is not restricted at all.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (!GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            // The first matching group is sufficient.
            return true;
        }

        return false;
    }
830
831
    /**
     * Parse GET vars of input Query into array with key=>value pairs
     *
     * Names and values are raw-URL-decoded. A parameter without "=" gets an empty
     * string as value, parts with an empty name are dropped and a later occurrence
     * of a name overwrites an earlier one.
     *
     * @param string $inputQuery Input query string, e.g. "id=1&L=2"
     * @return array Decoded parameter names as keys, decoded values as values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements: a part without "=" would otherwise leave the value
            // undefined and raise an "undefined offset" notice.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
853
854
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE
     *         - Default: Literal value
     *
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as $paramArray, each value replaced by the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Loop invariant; "??" avoids an undefined index notice when no hook is registered.
        $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is literal
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the ";" separated "key:value" sub parameters:
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // Padding keeps a sub parameter without ":" from raising an undefined offset notice.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        // The field has to be "uid" or a column configured in TCA.
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // Field names are handed over unquoted: the expression builder
                            // quotes identifiers itself, quoting them beforehand quotes them twice.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            if ((string)$addTable !== '') {
                                // Only touch the FROM part when _ADDTABLE is actually given.
                                // NOTE(review): add() is called without append, which presumably
                                // replaces the table set by from() - confirm the intended usage.
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $TCA[$table]['ctrl']['transOrigPointerField'] ?? null;
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                // Records pointing to an original record are language overlays, skip them.
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the VALUE of the selected field; indexing by the
                            // field name itself would collapse all rows into one single entry.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
994
995
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited per recursion level by the extension setting "maxCompileUrls".
     *
     * Values equal to an empty string do not add a GET parameter, the URL is taken over unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The compiled URLs; $urls itself if $paramArray is empty or $urls is not an array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Shift first parameter off the stack, the remaining ones are handled by the recursion below:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Loop invariant, so it is resolved once instead of once per compiled URL.
        $maxCompileUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Combine every URL collected so far with every value of the current parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                if (count($newUrls) > $maxCompileUrls) {
                    // Leave BOTH loops: leaving only the inner one would add one more
                    // URL for every remaining entry of $urls despite the limit.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1028
1029
    /************************************
1030
     *
1031
     * Crawler log
1032
     *
1033
     ************************************/
1034
1035
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, then entries selected are DELETED(!) instead of selected!
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the delete is not restricted to the page and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; no limit is set for values <= 0
     * @return array Queue rows ordered by "scheduled" descending; always an empty array when flushing
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();

            // andX() joins all constraints added below with AND.
            $constraints = $expressionBuilder->andX();
            switch ($filter) {
                case 'pending':
                    $constraints->add($expressionBuilder->eq('exec_time', 0));
                    break;
                case 'finished':
                    $constraints->add($expressionBuilder->gt('exec_time', 0));
                    break;
            }
            $constraints->add($expressionBuilder->eq('page_id', intval($id)));

            // flushQueue() expects the filter as SQL string.
            $this->flushQueue($doFullFlush ? '1=1' : (string)$constraints);

            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1089
1090
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, then entries selected are DELETED(!) instead of selected!
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the delete is not restricted to the set and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; no limit is set for values <= 0
     * @return array Queue rows ordered by "scheduled" descending; always an empty array when flushing
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();

            // andX() joins all constraints added below with AND.
            $constraints = $expressionBuilder->andX();
            switch ($filter) {
                case 'pending':
                    $constraints->add($expressionBuilder->eq('exec_time', 0));
                    break;
                case 'finished':
                    $constraints->add($expressionBuilder->gt('exec_time', 0));
                    break;
            }
            $constraints->add($expressionBuilder->eq('set_id', intval($set_id)));

            // An empty filter makes flushQueue() remove all queue entries.
            $this->flushQueue($doFullFlush ? '' : (string)$constraints);

            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1143
1144
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted
     * once per affected set ID together with the entries of that set before anything is deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = (string)$where;
        if ($realWhere === '') {
            $realWhere = '1=1';
        }

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: a builder keeps its state and
            // from() appends, so reusing one would repeat the table in later statements.
            // GROUP BY replaces "DISTINCT set_id", which select() would quote as one identifier.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other methods of this class address queue records by "qid";
                    // confirm that the queue table really has a "uid" column.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1184
1185
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; the current time is used if it evaluates to 0
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1215
1216
    /************************************
1217
     *
1218
     * URL setting
1219
     *
1220
     ************************************/
1221
1222
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only
     * collected in $this->queueEntries or written to "tx_crawler_queue". Before writing,
     * equal entries are looked up unless $skipInnerDuplicationCheck is set; a duplicate
     * is not inserted again. The events "urlAddedToQueue" respectively
     * "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Parameters which are stored serialized with the queue entry:
        $parameters = ['url' => $url];

        // Frontend user group simulation:
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Processing instructions and their parameters:
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // The queue record:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Look for an equal entry unless the caller opted out of the check
        $duplicateRows = $skipInnerDuplicationCheck
            ? []
            : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);

        if (count($duplicateRows) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1304
1305
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries of the same page with the same parameters hash are considered.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Queue record to be added; "page_id" and "parameters_hash" are evaluated
     *
     * @return array List of the "qid" values of the duplicates found
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // Entry is scheduled with "now" (or in the past)
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Only UNPROCESSED entries are duplicates: no execution time set and not yet
        // assigned to a process. Matching executed entries instead would let every
        // still pending URL be queued again.
        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1366
1367
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int Seconds since the Unix epoch as returned by time()
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1376
1377
    /************************************
1378
     *
1379
     * URL reading
1380
     *
1381
     ************************************/
1382
1383
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, writes exec_time (and the current process id, if one is set) to lock
     * the record, executes it through readUrl_exec() and finally stores the serialized result in
     * result_data. A signal is emitted before and after the processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null 0 or self::CLI_STATUS_POLLABLE_PROCESSED; null if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // No (unprocessed) record for this id; kept as a null return for existing callers.
            return null;
        }

        // unserialize() returns false for broken data, so only index into a real array.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back the string 'ERROR' or false instead of a result array,
        // in which case there is no 'content' to decode.
        // NOTE(review): for regular frontend requests 'content' is an HTTP response body that is
        // passed to unserialize() here - confirm that this source is trusted.
        $resultData = false;
        if (is_array($result) && isset($result['content'])) {
            $resultData = unserialize($result['content']);
        }

        $procInstructions = [];
        if (is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
        ) {
            $procInstructions = $resultData['parameters']['procInstructions'];
        }

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollSuccess) && $procInstructions !== []) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1482
1483
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given fields as a new queue record (with exec_time already set), executes it
     * through readUrl_exec() and writes the serialized result back to that record.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The value returned by readUrl_exec() for the inserted record
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $field_array);

        // The freshly created qid is needed both for the execution and for the final update.
        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1521
1522
    /**
     * Read URL for a queue record
     *
     * If the record's parameters name a callback object ("_CALLBACKOBJ"), that object's
     * crawler_execute() is called; otherwise the URL from the parameters is requested
     * through requestUrl() and a "urlCrawled" event is posted.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool Result array; the string 'ERROR' if the parameters could not be decoded;
     *                           false if requestUrl() failed
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object (regular frontend requests carry no "_CALLBACKOBJ" key at all):
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:
        // The crawler id is "<qid>:<hash>", where the hash covers qid, set_id and the encryption key.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1557
1558
    /**
     * Gets the content of a URL.
     *
     * Depending on the "makeDirectRequests" extension setting the URL is either handled by
     * sendDirectRequest() or fetched with an HTTP/1.0 request over a socket (optionally through the
     * configured proxy). With "follow30x" enabled, redirects are followed until $recursion is used up.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Result array (request, headers, content and, for redirects, parentRequest); false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() only returns the components that are present in the URL.
        $scheme = $url['scheme'] ?? '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and request the absolute URL from it.
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) ?: [];
            $targetPort = (int)($url['port'] ?? 0);
            $url['path'] = $scheme . '://' . ($url['host'] ?? '') . ($targetPort > 0 ? ':' . $targetPort : '') . ($url['path'] ?? '');
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        $host = $rurl['host'] ?? '';
        $port = (int)($rurl['port'] ?? 0);

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $port > 0 ? $port : 443;
        } else {
            $port = $port > 0 ? $port : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? '');

            if ($newUrl) {
                // Follow the redirect exactly once per hop, keeping the timeout and
                // decreasing the recursion limit so that redirect loops terminate.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($newRequestUrl)) {
                    $this->logger->debug(
                        sprintf('Error while opening "%s"', $newUrl),
                        ['crawlerId' => $crawlerId]
                    );
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1664
1665
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * $GLOBALS['TSFE']->absRefPrefix and, outside of CLI mode, TYPO3_SITE_PATH.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        $basePath = '/';
        $runsInCli = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$runsInCli) {
            // Outside of CLI mode the path can be taken from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/': exactly one leading and one trailing slash
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1695
1696
    /**
     * Runs the given command through shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1706
1707
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines are read up to the first empty line as headers (trimmed); everything after that
     * is collected unmodified as content. Anything that is not a resource yields an empty response.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a line consisting only of "0" is falsy but still valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1738
1739
    /**
     * Appends a timestamped message to the file configured as "logFileName" in the extension settings.
     * Does nothing if no file name is configured; a failed write is reported through the logger.
     *
     * In the future this setting "logFileName" should be removed in favor of using the TYPO3 Logging Framework
     *
     * @param string $message the message string to log
     */
    protected function log(string $message): void
    {
        $logFileName = $this->extensionSettings['logFileName'] ?? '';
        if (empty($logFileName)) {
            return;
        }

        // The warning is suppressed because the failure is handled explicitly below.
        $bytesWritten = @file_put_contents($logFileName, date('Ymd His') . ' ' . $message . PHP_EOL, FILE_APPEND);

        if ($bytesWritten === false) {
            $this->logger->info(
                sprintf('File "%s" could not be written, please check file permissions.', $logFileName)
            );
        }
    }
1752
1753
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line (HTTP/1.0 GET) followed by the Host, optional Cookie, Connection,
     * optional Authorization, X-T3crawler and User-Agent headers.
     *
     * @param array $url URL components as returned by parse_url(); missing components are treated as empty
     * @param string $crawlerId
     *
     * @return array List of header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits components that are absent from the URL, so nothing may be assumed to exist.
        $path = (string)($url['path'] ?? '');
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');
        $pass = (string)($url['pass'] ?? '');

        if ($path === '') {
            // The request target must not be empty (e.g. for "http://example.com").
            $path = '/';
        }

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1777
1778
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose first header line contains "301 Moved", "302 Found" or "302 Moved"
     * are considered. If a user is given, the credentials are inserted into the redirect target.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        $location = null;
        foreach ($headers as $headerLine) {
            // Split at the first colon only, so that values containing ':' (like URLs) stay intact.
            // Lines without a colon, e.g. the status line, are not header fields.
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) !== 2) {
                continue;
            }
            // Header field names are case-insensitive.
            if (strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }

        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        // NOTE(review): port and fragment of the Location are not carried over into the new URL.
        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '') . ($target['path'] ?? '');
        if (($target['query'] ?? '') !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1820
1821
    /**************************
1822
     *
1823
     * tslib_fe hooks:
1824
     *
1825
     **************************/
1826
1827
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected in the form "<qid>:<hash>". On success the crawler state is stored in
     * $params['pObj']->applicationData['tx_crawler']; otherwise script execution is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only requests carrying the crawler header need to be authenticated:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Pad the parts so that a header without ":" yields an empty hash instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $isAuthenticated = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // hash_equals() compares in constant time, the hash comes from the request.
            $isAuthenticated = hash_equals($expectedHash, (string)$hash);
        }

        if (!$isAuthenticated) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matched, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1864
1865
    /*****************************
1866
     *
1867
     * Compiling URLs to crawl - tools
1868
     *
1869
     *****************************/
1870
1871
    /**
     * Walks the page tree below a page and compiles the table rows for every page found.
     *
     * The given options are stored on the instance before the tree is traversed, and the duplicate
     * tracking and download URL lists are reset. For mount point pages (doktype 7) the mounted
     * tree is rendered as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Make sure a language service is available globally
        if (!is_object($GLOBALS['LANG'] ?? null)) {
            $GLOBALS['LANG'] = GeneralUtility::makeInstance(LanguageService::class);
            $GLOBALS['LANG']->init(0);
        }

        // Remember the options for the row rendering
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $permsClause);

        // Set root row:
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $tree->tree[] = [
                'row' => $rootPage,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPage)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $renderedRows = [];

        foreach ($tree->tree as $treeItem) {
            $this->MP = false;

            // recognize mount points
            if ($treeItem['row']['doktype'] == 7) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPointRows = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($treeItem['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->getRestrictions()->reset();

                $mountPoint = $mountPointRows[0];

                // fetch mounted pages
                $this->MP = $mountPoint['mount_pid'] . '-' . $treeItem['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPoint['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $renderedRows[] = $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if ($mountPoint['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $treeItem['row']['uid'] = $mountPoint['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $renderedRows[] = $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return implode('', $renderedRows);
    }
1978
1979
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts. A part without
     * "+" stands for the page only, "<pid>+" without a depth is expanded with depth 99.
     * Non-empty results are cached per exclude string in a static variable.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (!empty($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad the parts: "<pid>" without "+<depth>" would otherwise leave $depth undefined.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
2030
2031
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One row is produced per configuration that applies to the page (restricted to the incoming
     * configuration selection, if there is one); a single "No entries" row if there is none.
     *
     * @param    array     $pageRow             Page row
     * @param    string    $pageTitleAndIcon    Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // keep only the configurations that match the current selection
            $selectedKeys = $this->incomingConfigurationSelection;
            $configurations = array_filter(
                $configurations,
                function ($confKey) use ($selectedKeys) {
                    return in_array($confKey, $selectedKeys);
                },
                ARRAY_FILTER_USE_KEY
            );
        }

        $content = '';
        $configurationCount = count($configurations);

        if ($configurationCount === 0) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // Traverse parameter combinations:
            $rowIndex = 0;
            foreach ($configurations as $confKey => $confArray) {
                // Title column, spanning all rows of this page; only emitted with the first row:
                $titleClm = $rowIndex === 0
                    ? '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>'
                    : '';
                $rowClassSuffix = $rowIndex % 2 ? '-20' : '-10';

                if (in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                    $content .= '<tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $valueCounts = [];
                    $combinationCount = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $combinationCount *= $valueCount;
                        $valueCounts[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $valueCounts) . '=' . $combinationCount;

                    // Options
                    $optionLabels = [
                        'userGroups' => 'User Groups: ',
                        'baseUrl' => 'Base Url: ',
                        'procInstrFilter' => 'ProcInstr: ',
                    ];
                    $optionValues = '';
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if ($confArray['subCfg'][$optionKey]) {
                            $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                        }
                    }

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        }

        return $content;
    }
2148
2149
    /*****************************
2150
     *
2151
     * CLI functions
2152
     *
2153
     *****************************/
2154
2155
    /**
     * Returns a single value of a parsed CLI option.
     *
     * Looks the option up in $this->cli_args and returns the value stored at the
     * requested index. An empty string is returned when the option is unknown,
     * is not a list of values, or has no value at that index.
     *
     * NOTE(review): setCliArgs() stores the parsed arguments in $this->cliArgs,
     * whereas this method reads $this->cli_args - confirm which property is the
     * intended one.
     *
     * @param string $option Option string, eg. "-s"
     * @param int $idx Value index, default is 0 (zero) = the first one...
     * @return string
     */
    private function cli_argValue($option, $idx = 0)
    {
        // isset() also covers the case where the property itself was never populated.
        if (!isset($this->cli_args[$option]) || !is_array($this->cli_args[$option])) {
            return '';
        }

        return $this->cli_args[$option][$idx] ?? '';
    }
2166
2167
    /**
     * Writes a line of text to the CLI output.
     *
     * Thin convenience wrapper that forwards the text unchanged to outputLine().
     *
     * @param string $string The string to output
     */
    private function cli_echo($string)
    {
        $line = $string;

        $this->outputLine($line);
    }
2176
2177
    /**
     * Parses raw command line tokens into an option => values map.
     *
     * This is a copy from the CommandLineController from TYPO3 < v9
     *
     * Tokens that start with "-" (and are not followed by a digit) open a new
     * option; an inline value may be attached with "=" ("-opt=value"). Every
     * other token is appended to the most recently opened option, or to
     * "_DEFAULT" when no option has been opened yet. The result is stored in
     * $this->cliArgs.
     *
     * Using the same option twice prints an error and terminates the script.
     *
     * TODO: Rework
     *
     * @param array $argv Raw command line tokens
     */
    private function setCliArgs(array $argv)
    {
        $cli_options = [];
        $index = '_DEFAULT';
        foreach ($argv as $token) {
            // Options starting with a number is invalid - they could be negative values!
            // The length is checked implicitly (substr) so that "" and "-" do not
            // trigger "uninitialized string offset" errors.
            $isOption = is_string($token)
                && $token !== ''
                && $token[0] === '-'
                && !MathUtility::canBeInterpretedAsInteger(substr($token, 1, 1));

            if (!$isOption) {
                $cli_options[$index][] = $token;
                continue;
            }

            // Pad the result so that options without "=" yield a null value
            // instead of reading an undefined offset.
            list($index, $opt) = array_pad(explode('=', $token, 2), 2, null);
            if (isset($cli_options[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cli_options[$index] = [];
            if ($opt !== null) {
                $cli_options[$index][] = $opt;
            }
        }

        $this->cliArgs = $cli_options;
    }
2209
2210
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the first value of the "-conf" option and splits it on commas.
     *
     * @return array List of configuration keys; empty array if "-conf" was not given or is blank
     */
    protected function getConfigurationKeys()
    {
        // The value index is passed explicitly: cli_argValue() declares it as a required argument.
        $parameter = trim((string)$this->cli_argValue('-conf', 0));

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2220
2221
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps:
     *  1. run the registered CLI hooks,
     *  2. purge executed queue entries older than the "purgeQueueDays" setting,
     *  3. select up to $countInARun due, unassigned queue entries,
     *  4. reserve them for the current process id,
     *  5. crawl them one by one via readUrl().
     *
     * Each statement uses its own QueryBuilder instance; a QueryBuilder keeps
     * its query parts and parameters and must not be reused across statements.
     *
     * @param int $countInARun Maximum number of queue entries to process in this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep after the run has finished
     * @return int Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // The statement has to be executed explicitly - building it alone deletes nothing.
            $purgeQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
            $purgeQueryBuilder
                ->delete($this->tableName)
                ->where(
                    $purgeQueryBuilder->expr()->neq('exec_time', 0),
                    $purgeQueryBuilder->expr()->lt('exec_time', (int)$purgeDate)
                )
                ->execute();
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $rows = $selectQueryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQueryBuilder->expr()->eq('exec_time', 0),
                $selectQueryBuilder->expr()->eq('process_scheduled', 0),
                $selectQueryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        // The ids are embedded into an IN() list, so make sure they are plain integers.
        $quidList = array_map('intval', array_column($rows, 'qid'));

        $processId = $this->CLI_buildProcessId();

        // Reserve queue entries for this process.
        //TODO make sure we're not taking assigned queue-entires
        // Save the number of assigned queue entries to determine how many have been processed later.
        // set() turns its value into a named parameter itself, so raw values are passed here.
        $reserveQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $numberOfAffectedRows = (int)$reserveQueryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $reserveQueryBuilder->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', (int)$this->getCurrentTime())
            ->set('process_id', $processId)
            ->execute();

        // The process id is a hash string and must not be cast to an integer.
        $connectionPool->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => $numberOfAffectedRows],
                ['process_id' => $processId]
            );

        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // If the CLI has been disabled since the run started we need to return from the function
            // and must NOT mark the process as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2344
2345
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller. Does nothing when no
     * hooks are registered.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // "??" avoids undefined-index errors on installations without registered hooks.
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? null;
        if (!is_array($hookClasses)) {
            return;
        }

        foreach ($hookClasses as $objRef) {
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2362
2363
    /**
     * Tries to register a new crawler process under the given id.
     *
     * Active processes whose ttl lies in the past are treated as orphans and
     * are released at the end of the call; the remaining active processes are
     * counted against the "processLimit" setting. Regardless of the outcome,
     * processes marked as deleted and processes without assigned items are
     * removed through the process repository.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if the process was registered, false if the system
     *                 process id is unavailable or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and still running ones.
        $runningCount = 0;
        $expiredIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // Only add a new process while the number of running ones is below the limit.
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug(
                'add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')'
            );

            $newProcess = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $newProcess);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredIds, true);

        return $acquired;
    }
2428
2429
    /**
     * Release a process and the required resources
     *
     * Runs four independent UPDATE statements:
     *  1. queue entries owned by inactive processes are unassigned,
     *  2. inactive processes referenced by pending queue entries get their
     *     system_process_id reset,
     *  3. the requested processes are marked inactive if they have no pending
     *     queue entries,
     *  4. pending queue entries of the requested processes are unassigned.
     *
     * Each statement is built with its own QueryBuilder instance so that SET
     * parts and bound parameters of one statement cannot leak into the next.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the transaction handling is disabled
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1) Unassign all queue entries that still belong to inactive processes.
        $inactiveQueueQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $inactiveQueueQueryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // 2) Reset the system process id of inactive processes.
        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // NOTE(review): the pre-QueryBuilder implementation set deleted = 1 and
        // system_process_id = 0 on processes with active = 0 AND deleted = 0 that had
        // NO queue entry with exec_time = 0 (NOT EXISTS). This statement selects
        // processes that DO have such entries and does not set "deleted" - confirm
        // the intended behaviour.
        $inactiveProcessQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $inactiveProcessQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $inactiveProcessQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3) Mark all requested processes as non-active.
        $deactivateQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $deactivateQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateQueryBuilder->expr()->in(
                    'process_id',
                    $deactivateQueryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)
                ),
                $deactivateQueryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // 4) Unassign the pending queue entries of the requested processes.
        $releaseQueueQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $releaseQueueQueryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $releaseQueueQueryBuilder->expr()->eq('exec_time', 0),
                $releaseQueueQueryBuilder->expr()->in(
                    'process_id',
                    $releaseQueueQueryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)
                )
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2523
2524
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Reads the "active" flag of the matching row in tx_crawler_process (the
     * one with the lowest ttl if several exist).
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false if no such process exists
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_crawler_process');

        // Process ids are hash strings, so the id is bound as a string parameter.
        // Casting it to an integer would compare the column numerically and could
        // match unrelated processes.
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter((string)$pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->setMaxResults(1)
            ->execute()
            ->fetch();

        return is_array($row) && intval($row['active']) === 1;
    }
2553
2554
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is a short MD5 hash of the current microtime and is cached in
     * $this->processID, so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2566
2567
    /**
     * Wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2576
2577
    /**
     * Echoes a message followed by a newline and flushes the output.
     *
     * Nothing is printed unless the "processDebug" extension setting evaluates
     * to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2589
2590
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary, passing the frontend base path, the URL and the
     * serialized, base64-encoded request headers as arguments. The command is
     * executed and its output is returned as the content; the URL and the
     * elapsed time are logged.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array
     *                with the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Binary first, then every argument shell-escaped on its own.
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $cmd = implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content,
        ];
    }
2628
2629
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * The resulting condition is handed to flushQueue().
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $now = time();

        // 86400 = 24*60*60 seconds in 24 hours
        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * 86400;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2645
2646
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Loads the TCA, makes sure a TimeTracker is available in $GLOBALS['TT'],
     * and creates a TypoScriptFrontendController in $GLOBALS['TSFE'] that is
     * then initialized step by step (page repository, frontend user, page id,
     * template, root line, config array).
     *
     * @param int $id Page id handed to the frontend controller and used for the root line
     * @param int $typeNum Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $tsfe = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        $GLOBALS['TSFE'] = $tsfe;

        // The order of the following calls is kept exactly as it was.
        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();
    }
2671
2672
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2685
2686
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration is compared loosely, in this order:
     *  -1 => false,
     *   0 => the page configuration value is returned as-is,
     *   1 => true,
     *  anything else => false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons on purpose: they mirror the matching rules of a switch statement.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        if ($crawlerConfiguration == 1) {
            return true;
        }

        return false;
    }
2707
}
2708