Completed
Push — typo3v9 ( 331de1...a4b47d )
by Tomas Norre
23:42 queued 22:06
created

CrawlerController::cli_echo()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
ccs 0
cts 3
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <dev@aoe.com>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use Psr\Log\LoggerAwareInterface;
35
use Psr\Log\LoggerAwareTrait;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Core\Environment;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
43
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController implements LoggerAwareInterface
61
{
62
    use LoggerAwareTrait;
63
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
    const CLI_STATUS_REMAIN = 1; // queue not empty
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
    const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * One hour is max stalled time for the CLI
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
     *
     * @var int
     */
    public $max_CLI_exec_time = 3600;

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * Plain-text "[date] url" entries appended by urlListFromUrlArray()
     *
     * @var array
     */
    public $urlList = [];

    /**
     * Extension configuration, see setExtensionSettings()
     *
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * False when no mount point is used. Otherwise a string containing a "-": getPageTSconfigForId()
     * uses the part after the dash as page id, getUrlsForPageId() appends the whole value as "&MP=".
     *
     * @var string|bool
     */
    public $MP = false;

    /**
     * File whose existence marks the crawler as disabled, see setDisabled() / getDisabled()
     *
     * @var string
     */
    protected $processFilename;

    /**
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
     *
     * @var string
     */
    protected $accessMode;

    /**
     * @var BackendUserAuthentication
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';
186
187
    /**
     * Returns the internal access mode.
     *
     * The property is documented to hold 'gui', 'cli' or 'cli_im'. It has no initial value,
     * so null is returned until setAccessMode() has been called.
     *
     * @return string|null
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
196
197
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode can be 'gui', 'cli' or 'cli_im'
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
204
205
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file to the process filename, enabling removes that file again
     * if it is present. getDisabled() reports the state by checking for this file.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
221
222
    /**
     * Tells whether the crawler is currently disabled, i.e. whether the marker file exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
231
232
    /**
     * Sets the file that setDisabled() / getDisabled() use as the "disabled" marker.
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
241
242
    /**
     * Returns the file that is used as the "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
249
250
    /************************************
251
     *
252
     * Getting URLs based on Page TSconfig
253
     *
254
     ************************************/
255
256 31
    /**
     * Wires the repositories, remembers the backend user, sets the default marker file and
     * loads the extension configuration with sane defaults for "countInARun" and "processLimit".
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // The global may be missing in contexts without a backend user; avoid a notice in that case.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults (the keys are optional, so read them without triggering undefined-index notices):
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
    }
280
281
    /**
     * Replaces the extension settings
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
291
292
    /**
     * Decides whether a page must be left out of crawling.
     *
     * The checks run in this order and the first hit wins:
     * hidden page, disallowed doktype, doktype excluded via configuration, veto hooks.
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // hidden pages are only crawled if the extension is configured to do so
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // the first veto ends the evaluation, remaining hooks are not called
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
        }

        return false;
    }
351
352
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); if it is to be skipped, an empty
     * array is returned and the reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage receives the skip reason, or an empty string if the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Compare numerically: the column may be absent from the row, and a value read from the
        // database can arrive as the string "2", which a strict comparison with the integer 2 misses.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
376
377
    /**
     * Counts the queue entries of a page that match the given configuration hash and have not
     * been executed yet (exec_time = 0).
     * If there are none, the caller can skip the more detailed per-entry check.
     *
     * @param  int $uid
     * @param  string $configurationHash
     * @return boolean true if no such entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', intval($uid)),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0)
        ];

        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        return !$pendingEntries;
    }
408
409
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl (uses the keys "URLs" and "subCfg").
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference, contains one key per url to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference, filled with URLs for download if the flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string HTML meant for display in the backend module, or an error text if $vv holds no URL array
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlList = '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // The filter only depends on the configuration and the incoming instructions, not on the single URL.
        if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
            return $urlList;
        }

        // Seconds between two requests. A non-positive rate would otherwise cause a division by zero,
        // in that case the URLs are simply not spread over time.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, rounded down to the full minute (date() expects an integer timestamp):
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is overwritten per URL, so only the last processed URL ends up in the
            // return value although the docblock speaks of a list - confirm whether callers rely on this.
            if (isset($duplicateTrack[$uKey])) {
                //if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
498
499
    /**
     * Checks a comma-separated list of processing instructions against the incoming ones.
     * Without any incoming instruction nothing is filtered and the result is always true.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true if one of the incoming instructions is contained in $piString
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        // Reached without a match: only an empty filter list lets everything pass.
        return $incomingProcInstructions === [];
    }
519
520 2
    /**
     * Returns the page TSconfig for a page, after the registered "getPageTSconfigForId" hooks had
     * the chance to alter it.
     *
     * If a mount point is set in $this->MP, the TSconfig of the page id found after the "-" in
     * that value is used instead of the one of $id.
     *
     * @param int $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration (read defensively, the hook list is usually not configured)
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that a hook can change the configuration in place
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
542
543
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Two sources are evaluated: the "tx_crawler.crawlerCfg.paramSets." section of the page TSconfig
     * and the tx_crawler_configuration records stored on the pages of the rootline. A record never
     * replaces a configuration with the same name that has been collected before. Finally the
     * "processUrls" hooks may alter the result.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array configurations indexed by their key / name
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */

        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        // Pages without crawler TSconfig have none of the nested keys, so read them without notices.
        $paramSets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;

        if (is_array($paramSets)) {
            foreach ($paramSets as $key => $values) {
                if (!is_array($values)) {
                    continue;
                }
                $key = str_replace('.', '', $key);
                // Sub configuration for a single configuration string:
                $subCfg = (array)($paramSets[$key . '.'] ?? []);
                $subCfg['key'] = $key;

                $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
                if ($procInstrFilter !== '') {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
                }
                $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($paramSets[$key] ?? '');
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                $baseQuery = '?id=' . $id . ($this->MP ? '&MP=' . $this->MP : '');
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    // the value is embedded into the statement, so make sure it is an integer
                    $queryBuilder->expr()->eq('pid', (int)$page['uid'])
                )
                ->execute()
                ->fetchAll();

            foreach ($configurationRecordsForCurrentPage ?? [] as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$hasAccess) {
                    continue;
                }

                $pidsOnly = (string)$configurationRecord['pidsonly'];
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // the page itself may be excluded by the record
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // by reference: hooks modify the result in place
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
689
690
    /**
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
     *
     * The given $baseUrl is returned unchanged if $sysDomainUid is not a positive number, if no
     * matching sys_domain record is found or if the record has an empty domainName.
     *
     * @param string $baseUrl
     * @param integer $sysDomainUid
     * @param bool $ssl anything but false selects "https", false selects "http"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = (int)$sysDomainUid;
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Ask for the query builder of the table that is actually queried, so the statement is
        // sent to the connection sys_domain is mapped to.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute()
            ->fetch();

        // fetch() yields false if there is no such record
        if (is_array($row) && (string)($row['domainName'] ?? '') !== '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';

            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
720
721
    /**
     * Collects the names of the crawler configurations that are available for a page branch.
     *
     * The names come from the paramSets in the page TSconfig of the root page and from the
     * tx_crawler_configuration records stored on the rootline of the root page or on the pages of
     * the tree below it (limited by $depth and by the page permissions of the backend user).
     *
     * @param int $rootid page id the branch starts at
     * @param int $depth depth passed to the page tree
     * @return array list of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        // Pages without crawler TSconfig have none of the nested keys, so read them without notices.
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                // strip the trailing dot of the TypoScript array key
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
780
781
    /**
     * Creates a query builder on the connection of the given table
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
793
794
    /**
     * Decide whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
815
816
    /**
     * Parse GET vars of input Query into array with key=>value pairs
     *
     * The query is split on "&", each part is split on the first "=" and both
     * sides are raw-URL-decoded. Parts with an empty name are dropped; a part
     * without "=" (e.g. "flag") is kept with an empty string as its value.
     *
     * @param string $inputQuery Input query string
     * @return array Decoded parameter names mapped to their decoded values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so a parameter without "=" does not read a missing index
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
838
839
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further sub parameters read by the lookup: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE
     *         - Default: Literal value
     *
     * After each part the hook registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * is called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Array with key (GET var name) and, for each, the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is a literal value
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Reset the paramArray value as an array and fill it from the parts inside the brackets
            $paramArray[$p] = [];

            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                $reg = [];

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse "key:value" sub parameters; a key without value stays unset (null)
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $tableName = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            // Only touch the FROM part when an additional table was configured,
                            // otherwise the table set above would be overwritten by an empty entry.
                            if ($addTable !== '') {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = isset($GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the looked-up field value so the keys are the values to add
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ? $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
                    : null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
977
978
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key
     *
     * Each recursion level takes the first parameter off $paramArray and appends
     * "&name=value" (raw-URL-encoded) for every value to every URL collected so far;
     * an empty value leaves the URL unchanged. Compilation of a level stops as soon
     * as more than the "maxCompileUrls" extension setting (default 10000) URLs have
     * been collected.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (empty($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Shift first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        $maxCompileUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . ((string)$val !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // Leave both loops, otherwise every further base URL would still add one more entry
                if (count($newUrls) > $maxCompileUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1012
1013
    /************************************
1014
     *
1015
     * Crawler log
1016
     *
1017
     ************************************/
1018
1019
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed regardless of page ID and filter
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue records, or an empty array when entries were flushed
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The AND-composite collects the same conditions for the DELETE statement used when flushing
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1073
1074
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed regardless of set ID and filter
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue records, or an empty array when entries were flushed
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The AND-composite collects the same conditions for the DELETE statement used when flushing
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty condition makes flushQueue() remove every entry
            $this->flushQueue($doFullFlush ? '' : (string)$flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1127
1128
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected
     * entries are first selected per set ID and posted to the event dispatcher;
     * afterwards all entries matching the condition are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may hand in an expression object; normalise it to a plain SQL string
        $realWhere = (string)$where;
        if ($realWhere === '') {
            $realWhere = '1=1';
        }

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends to the FROM part,
            // so reusing one builder would list the table more than once.
            $groupQuery = $this->getQueryBuilder($this->tableName);
            $groups = $groupQuery
                // selectLiteral() keeps "DISTINCT" from being quoted as part of the identifier
                ->selectLiteral('DISTINCT ' . $groupQuery->quoteIdentifier('set_id'))
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQuery = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on the queue table address rows by "qid";
                    // confirm that a "uid" column exists for this select.
                    $subSet = $subSetQuery
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQuery->expr()->eq(
                                'set_id',
                                $subSetQuery->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1168
1169
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the parameters under the key "_CALLBACKOBJ"
     * and the serialized parameters are inserted as a new row into tx_crawler_queue.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function (anything that is not an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to "now" when no schedule time was given
        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1199
1200
    /************************************
1201
     *
1202
     * URL setting
1203
     *
1204
     ************************************/
1205
1206
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (see $this->registerQueueEntriesInternallyOnly) or inserts it into
     * tx_crawler_queue unless an equal entry already exists. The events
     * "urlAddedToQueue" / "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the key order is part of the serialized value and its hash):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $userGroupUids = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $uGs = implode(',', $userGroupUids);
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // check if there is already an equal entry, unless the caller opted out
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicateRows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert('tx_crawler_queue', $fieldArray);
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1288
1289
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are still unprocessed (no exec_time, no process_id) and
     * that match the page ID and parameters hash of $fieldArray are considered.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Queue record to compare against; "page_id" and "parameters_hash" are read
     *
     * @return array List of "qid" values of the duplicate entries (empty if there are none)
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        $statement = $queryBuilder
            // A duplicate must still be waiting in the queue: neither executed nor claimed by a process.
            // (The previous "!= 0" conditions selected already processed entries instead.)
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1350
1351
    /**
     * Returns the current system time as reported by time()
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1360
1361
    /************************************
1362
     *
1363
     * URL reading
1364
     *
1365
     ************************************/
1366
1367
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by setting exec_time, executes the request
     * through readUrl_exec() and finally stores the serialized result in result_data.
     * The pre-process signal is emitted before the lock, the post-process signal
     * before the result is written.
     *
     * @param int $queueId qid of the queue record to process
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bitmask in which self::CLI_STATUS_POLLABLE_PROCESSED is set when a
     *                  registered pollable processing instruction reported success (0 otherwise);
     *                  null when no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Only pick up entries that are scheduled but not yet executed
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        // unserialize() returns false on broken data, so check the type before reading a key
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return the string 'ERROR' or false instead of an array;
        // only an array result carries serialized content that can be inspected.
        $resultData = (is_array($result) && isset($result['content']))
            ? unserialize($result['content'])
            : false;

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1466
1467
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given field array as a new queue record (with exec_time set to lock it),
     * executes it via readUrl_exec() and writes the serialized result back to that record.
     *
     * @param array $field_array Queue field array
     *
     * @return mixed Whatever readUrl_exec() returned for the new record
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The freshly created qid is needed both for the request and for the final update
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1505
1506
    /**
     * Read URL for a queue record
     *
     * Depending on the unserialized "parameters" of the record this either delegates to a
     * callback object (key "_CALLBACKOBJ") or performs a regular frontend request.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool The string 'ERROR' when the parameters cannot be unserialized to an
     *                           array; an array with key "content" for callback objects; otherwise
     *                           the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object (the key is absent for regular requests, hence !empty()):
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:
        // The crawler id is "qid:hash"; the hash is built from qid, set_id and the encryption key.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1541
1542
    /**
     * Gets the content of a URL.
     *
     * Unless direct requests are enabled in the extension settings, the URL is fetched with a
     * plain HTTP/1.0 request over fsockopen(), optionally through the configured proxy server.
     * Redirects are followed when the "follow30x" setting is enabled.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param int $timeout Timeout time
     * @param int $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with the keys "request", "headers" and "content" (plus "parentRequest"
     *                    after a followed redirect), the result of sendDirectRequest() for direct
     *                    requests, or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        $scheme = $url['scheme'] ?? '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            // When talking to a proxy the request line must carry the absolute URL
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . (($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $configuredPort = $rurl['port'] ?? 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = ($configuredPort > 0) ? $configuredPort : 443;
        } else {
            $port = ($configuredPort > 0) ? $configuredPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Follow the redirect exactly once per hop, with the same timeout and a reduced recursion budget
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($newRequestUrl)) {
                $this->logger->debug(
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1648
1649
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the extension setting "frontendBasePath",
     * the absRefPrefix of the current TSFE, and - outside of CLI mode - the
     * TYPO3_SITE_PATH environment value. Without any of these "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (Environment::isCli()) {
            // No $_SERVER environment to derive the path from in CLI mode
            $basePath = '/';
        } else {
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1679
1680
    /**
     * Runs a shell command via shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1690
1691
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (trimmed) up to the first empty line; everything after
     * that is collected unmodified as content lines.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers; compare against false explicitly so that a falsy line such as "0"
        // does not end the loop prematurely
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // blank line separates headers from body
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1722
1723
    /**
     * Appends a timestamped message to the file configured in the "logFileName" extension setting.
     *
     * Nothing happens when no log file is configured. If the file cannot be written, an
     * info entry is sent to the logger instead.
     *
     * In the future this setting "logFileName" should be removed in favor of using the TYPO3 Logging Framework
     *
     * @param string $message the message string to log
     */
    protected function log(string $message): void
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFileName = $this->extensionSettings['logFileName'];

        // The PHP warning is suppressed on purpose; the failure is reported through the logger below.
        $bytesWritten = @file_put_contents($logFileName, date('Ymd His') . ' ' . $message . PHP_EOL, FILE_APPEND);

        if ($bytesWritten === false) {
            $this->logger->info(
                sprintf('File "%s" could not be written, please check file permissions.', $logFileName)
            );
        }
    }
1736
1737
    /**
     * Builds HTTP request headers.
     *
     * @param array $url URL parts as returned by parse_url(); the keys "path", "query", "host",
     *                   "user" and "pass" are read and all of them may be missing
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array List of header lines, starting with the HTTP/1.0 request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits keys that are not present in the URL, so default every part
        $path = $url['path'] ?? '/';
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            // Sent for requests that carry the ADMCMD_previewWS parameter
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . ($url['pass'] ?? ''));
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1761
1762
    /**
     * Check if the submitted HTTP-Header contains a redirect location and build the new crawler-url
     *
     * A redirect is recognised when the first header line contains "301 Moved", "302 Found"
     * or "302 Moved". If a user is given, the credentials are inserted into the target URL.
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect target, or false when there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Split each line only at the first ": " so that values containing ": " stay intact;
        // lines without a separator (e.g. the status line) simply have no value.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(': ', (string)$headerLine, 2);
            if (trim($parts[0]) === 'Location') {
                $location = trim($parts[1] ?? '');
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        // Re-insert the HTTP auth credentials into the redirect target
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '');
        if (isset($target['port'])) {
            // keep a non-default port of the redirect target
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= $target['path'] ?? '';
        if (($target['query'] ?? '') !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1804
1805
    /**************************
1806
     *
1807
     * tslib_fe hooks:
1808
     *
1809
     **************************/
1810
1811
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On success the crawler state is stored in
     * $params['pObj']->applicationData['tx_crawler']; otherwise the script is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Pad the parts so that a header without ':' yields an empty hash instead of an undefined offset
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $isAuthenticated = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // The hash comes from a request header: compare in constant time
            $isAuthenticated = hash_equals($expectedHash, (string)$hash);
        }

        // If a crawler record was found and hash was matching, set it up:
        if (!$isAuthenticated) {
            die('No crawler entry found!');
        }

        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1848
1849
    /*****************************
1850
     *
1851
     * Compiling URLs to crawl - tools
1852
     *
1853
     *****************************/
1854
1855
    /**
     * Walks the page tree below (and including) the given page and compiles the
     * table rows for every page via drawURLs_addRowsForPage(). For mount point
     * pages the mounted tree is processed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated row markup of all processed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Keep the request options on the instance; drawURLs_addRowsForPage() reads them from there
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $permsClause);

        // Set root row (only if the page is readable):
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $tree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rowsHtml = '';

        foreach ($tree->tree as $treeItem) {
            $currentRow = $treeItem['row'];
            $this->MP = false;

            // recognize mount points
            if ($currentRow['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $mountQuery->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $mountQuery
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $mountQuery->expr()->eq('uid', $mountQuery->createNamedParameter($currentRow['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $mountQuery->getRestrictions()->reset();

                $mountRow = $mountpage[0];

                // fetch mounted pages
                $this->MP = $mountRow['mount_pid'] . '-' . $currentRow['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountRow['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountItem) {
                    $mountTitle = $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true);
                    $rowsHtml .= $this->drawURLs_addRowsForPage($mountItem['row'], $mountTitle);
                }

                if ($mountRow['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $currentRow['uid'] = $mountRow['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $currentRow, true);
            $rowsHtml .= $this->drawURLs_addRowsForPage($currentRow, $pageTitle);
        }

        return $rowsHtml;
    }
1957
1958
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of "<pid>" or "<pid>+<depth>" entries.
     * A bare pid excludes only that page; "<pid>+" (no usable depth) is expanded with depth 99;
     * "<pid>+<depth>" additionally collects the page uids found in the tree below the pid.
     * Results are cached per exclude string for the duration of the PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array List of excluded page ids (unique)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry too
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Entries without '+' have no depth part; pad to avoid an undefined offset
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (!isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
2009
2010
    /**
     * Builds the table rows shown for one page of the page tree.
     * Every crawler configuration that applies to the page yields one row with its
     * GET parameter configuration; a single "No entries" row is produced when no
     * configuration applies.
     *
     * @param    array        $pageRow Page row
     * @param    string       $pageTitleAndIcon Page icon and title for row
     * @return   string       HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations ($skipMessage is handed over as second argument)
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Keep only configurations that are part of the current selection (if there is one)
        if (!empty($this->incomingConfigurationSelection)) {
            foreach ($configurations as $candidateKey => $unusedConfiguration) {
                if (!in_array($candidateKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$candidateKey]);
                }
            }
        }

        $rowIndex = 0;
        $rows = '';
        if (empty($configurations)) {
            $message = '';
            if (!empty($skipMessage)) {
                $message = ' (' . $skipMessage . ')';
            }

            // Compile row:
            $rows .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // Traverse parameter combinations:
            $configurationCount = count($configurations);
            $optionLabels = [
                'userGroups' => 'User Groups: ',
                'baseUrl' => 'Base Url: ',
                'procInstrFilter' => 'ProcInstr: ',
            ];

            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row carries it, spanning all rows of this page
                $titleClm = $rowIndex
                    ? ''
                    : '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>';
                $zebra = $rowIndex % 2 ? '-20' : '-10';

                $excludedPids = $this->expandExcludeString($confArray['subCfg']['exclude']);
                if (in_array($pageRow['uid'], $excludedPids)) {
                    $rows .= '<tr class="bgColor' . $zebra . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode("\n", $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if ($confArray['subCfg'][$optionKey]) {
                            $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                        }
                    }

                    // Parsed parameters, one "&name=value" pair per line
                    $parsedParams = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                    $parsedParams = trim(str_replace('&', "\n" . '&', $parsedParams));
                    $parsedParams = nl2br(htmlspecialchars(rawurldecode($parsedParams)));

                    // Compile row:
                    $rows .= '
                        <tr class="bgColor' . $zebra . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . $parsedParams . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        }

        return $rows;
    }
2127
2128
    /*****************************
2129
     *
2130
     * CLI functions
2131
     *
2132
     *****************************/
2133
2134
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered CLI hooks, optionally purges executed queue entries that
     * are older than the configured "purgeQueueDays", reserves up to $countInARun
     * due queue entries for the current process and reads their URLs one by one.
     *
     * NOTE: the reservation of queue entries is currently not wrapped in a database
     * transaction (the former BEGIN/COMMIT/ROLLBACK statements are disabled).
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Pause after each URL, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)$this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $quidList = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Reserve the queue entries for this process.
        //TODO make sure we're not taking assigned queue-entries
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $queryBuilderUpdate
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $queryBuilderUpdate->createNamedParameter($processId, \PDO::PARAM_STR))
            ->execute();

        // Save the number of assigned queue entries to determine how many have been processed later.
        // The process id is a string (see CLI_buildProcessId()) and is stored as such in
        // tx_crawler_process, so it must not be cast to int when used as identifier here.
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        // Fewer (or more) rows reserved than selected: another process interfered, give up.
        if ($numberOfAffectedRows != count($quidList)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // if during the start and the current read url the cli has been disabled we need to return
            // from the function and mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2260
2261
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
2275
2276
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active, non-deleted processes whose ttl lies in the past are treated as
     * orphans and released; the remaining ones count against the configured
     * "processLimit". Note that the debug output prints CLI_buildProcessId(),
     * while the inserted row uses the given $id.
     *
     * @param string $id identification string for the process
     * @return boolean false if no system process id is available or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        // Split the active processes into still running ones and expired (orphaned) ones.
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . $processLimit . ")");

            $newProcess = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $newProcess);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . $processLimit . ")");
        }

        // Housekeeping is done regardless of whether a new process could be acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
2341
2342
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements on one shared query builder, in this order:
     *  1. queue entries that belong to inactive processes are detached,
     *  2. system_process_id is reset on inactive processes that still have unexecuted queue entries,
     *  3. the requested processes are marked as non-active,
     *  4. unexecuted queue entries of the requested processes are detached.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the transaction statements are disabled
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $qb = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // When not running within an existing lock, a transaction used to be opened here
        // (BEGIN) and closed at the end (COMMIT); both statements are currently disabled.

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. Detach all queue entries which are assigned to a non-active process.
        $qb->update('tx_crawler_queue', 'q');
        $qb->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        );
        $qb->set('q.process_scheduled', 0);
        $qb->set('q.process_id', '');
        $qb->execute();

        $qb->resetQueryPart('set');

        // 2. Reset the system process id of non-active processes with unexecuted queue entries.
        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // The previous (TYPO3_DB based) statement updated tx_crawler_process rows with
        // active=0 AND deleted=0 for which NO queue entry with exec_time = 0 existed and
        // set deleted = 1 and system_process_id = 0 on them.
        $qb->update('tx_crawler_process');
        $qb->where(
            $qb->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        );
        $qb->set('system_process_id', 0);
        $qb->execute();

        // 3. mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement, so the
        // system_process_id assignment from step 2 appears to be still part of it - confirm
        // whether that is intended.
        $qb->update('tx_crawler_process');
        $qb->where(
            'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
            $qb->expr()->in('process_id', $qb->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
            $qb->expr()->eq('deleted', 0)
        );
        $qb->set('active', 0);
        $qb->execute();

        $qb->resetQueryPart('set');

        // 4. Detach the unexecuted queue entries of the requested processes.
        $qb->update('tx_crawler_queue');
        $qb->where(
            $qb->expr()->eq('exec_time', 0),
            $qb->expr()->in('process_id', $qb->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
        );
        $qb->set('process_scheduled', 0);
        $qb->set('process_id', '');
        $qb->execute();

        return true;
    }
2436
2437
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process in tx_crawler_process (ordered by ttl, first row wins)
     * and reports whether its "active" column equals 1.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false as well when no matching process row exists
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                // The process id is a string identifier (see CLI_buildProcessId()); it has to be
                // compared as string, casting it to int would not compare against the real id.
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter($pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        // fetch() yields false when there is no (further) row
        return is_array($row) && (int)$row['active'] === 1;
    }
2466
2467
    /**
     * Returns the Id of the current process.
     *
     * On the first call the Id is derived from the current microtime via
     * GeneralUtility::shortMD5() and stored in $this->processID; later calls
     * return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2479
2480
    /**
     * Thin wrapper around PHP's native microtime() function.
     *
     * @param bool $get_as_float Passed through unchanged to the native function
     *
     * @return mixed Whatever the native microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = \microtime($get_as_float);

        return $currentMicrotime;
    }
2489
2490
    /**
     * Prints a message followed by a line break to the stdout, but only if the
     * extension setting "processDebug" evaluates to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2502
2503
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command from the configured "phpPath", the crawler's
     * cli/bootstrap.php, the frontend base path, the URL and the serialized,
     * base64 encoded request headers, executes it and logs the elapsed time.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array
     *                with the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every part is escaped on its own before the parts are joined by single spaces.
        $command = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers))),
        ]);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $content,
        ];
    }
2541
2542
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the extension setting "cleanUpProcessedAge"
     * - entries scheduled longer ago than the extension setting "cleanUpScheduledAge"
     *
     * Both settings are multiplied by 86400, i.e. they are interpreted as days.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $currentTimestamp = time();
        $processedBefore = $currentTimestamp - $maxProcessedAge;
        $scheduledBefore = $currentTimestamp - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2558
2559
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates a TypoScriptFrontendController for the given page, registers it as
     * $GLOBALS['TSFE'] and runs its initialisation steps in a fixed order.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            null,
            $pageId,
            0
        );

        // Register the instance globally before initialising it.
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->getConfigArray();
        $frontendController->settingLanguage();
        $frontendController->settingLocale();
        $frontendController->newCObj();
    }
2582
2583
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they
     * do not influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2596
2597
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration is compared loosely against:
     *  -1 => never https,
     *   0 => use the value of the page configuration (returned unchanged),
     *   1 => always https;
     * every other value results in false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons on purpose, checked in the order -1, 0, 1.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2618
}
2619