Completed
Push — typo3v9 ( f5e04d...7f7f07 )
by Tomas Norre
13:31 queued 12:06
created

CrawlerController::isCrawlingProtocolHttps()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 4

Importance

Changes 0
Metric Value
cc 4
nc 4
nop 2
dl 0
loc 13
ccs 8
cts 8
cp 1
crap 4
rs 9.8333
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use Psr\Log\LoggerAwareInterface;
35
use Psr\Log\LoggerAwareTrait;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Core\Environment;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
43
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController implements LoggerAwareInterface
61
{
62
    use LoggerAwareTrait;
63
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
65
    const CLI_STATUS_REMAIN = 1; //queue not empty
66
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
67
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
68
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
69
70
    /**
71
     * @var integer
72
     */
73
    public $setID = 0;
74
75
    /**
76
     * @var string
77
     */
78
    public $processID = '';
79
80
    /**
81
     * @var array
82
     */
83
    public $duplicateTrack = [];
84
85
    /**
86
     * @var array
87
     */
88
    public $downloadUrls = [];
89
90
    /**
91
     * @var array
92
     */
93
    public $incomingProcInstructions = [];
94
95
    /**
96
     * @var array
97
     */
98
    public $incomingConfigurationSelection = [];
99
100
    /**
101
     * @var bool
102
     */
103
    public $registerQueueEntriesInternallyOnly = false;
104
105
    /**
106
     * @var array
107
     */
108
    public $queueEntries = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $urlList = [];
114
115
    /**
116
     * @var array
117
     */
118
    public $extensionSettings = [];
119
120
    /**
121
     * Mount Point
122
     *
123
     * @var boolean
124
     */
125
    public $MP = false;
126
127
    /**
128
     * @var string
129
     */
130
    protected $processFilename;
131
132
    /**
133
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
134
     *
135
     * @var string
136
     */
137
    protected $accessMode;
138
139
    /**
140
     * @var BackendUserAuthentication
141
     */
142
    private $backendUser;
143
144
    /**
145
     * @var integer
146
     */
147
    private $scheduledTime = 0;
148
149
    /**
150
     * @var integer
151
     */
152
    private $reqMinute = 0;
153
154
    /**
155
     * @var bool
156
     */
157
    private $submitCrawlUrls = false;
158
159
    /**
160
     * @var bool
161
     */
162
    private $downloadCrawlUrls = false;
163
164
    /**
165
     * @var QueueRepository
166
     */
167
    protected $queueRepository;
168
169
    /**
170
     * @var ProcessRepository
171
     */
172
    protected $processRepository;
173
174
    /**
175
     * @var string
176
     */
177
    protected $tableName = 'tx_crawler_queue';
178
179
    /**
180
     * Method to get the accessMode; can be gui, cli or cli_im
181
     *
182
     * @return string
183
     */
184 1
    public function getAccessMode()
    {
        // The property has no declared default, so this yields null until a mode was assigned.
        $currentMode = $this->accessMode;

        return $currentMode;
    }
188
189
    /**
190
     * @param string $accessMode
191
     */
192 1
    public function setAccessMode($accessMode)
    {
        // Stored verbatim; the value is not validated against 'gui', 'cli' or 'cli_im' here.
        $this->accessMode = $accessMode;

        return;
    }
196
197
    /**
198
     * Set disabled status to prevent processes from being processed
199
     *
200
     * @param  bool $disabled (optional, defaults to true)
201
     * @return void
202
     */
203 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        // Disabling is signalled by the mere presence of an (empty) marker file.
        if ($disabled) {
            GeneralUtility::writeFile($markerFile, '');
            return;
        }

        // Enabling again: remove the marker file if there is one.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
213
214
    /**
215
     * Get disable status
216
     *
217
     * @return bool true if disabled
218
     */
219 3
    public function getDisabled()
    {
        // Disabled state is derived solely from the existence of the marker file.
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
223
224
    /**
225
     * @param string $filenameWithPath
226
     *
227
     * @return void
228
     */
229 4
    public function setProcessFilename($filenameWithPath)
    {
        // Overrides the marker-file location that the constructor derives from the var path.
        $this->processFilename = $filenameWithPath;

        return;
    }
233
234
    /**
235
     * @return string
236
     */
237 1
    public function getProcessFilename()
    {
        // Path of the marker file whose presence signals "crawler disabled".
        $markerFile = $this->processFilename;

        return $markerFile;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 31
    public function __construct()
    {
        // Wires up the repositories, the backend user, the marker-file path and
        // the extension settings (with sane defaults for the run limits).
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // Read defensively so a missing global does not raise an undefined-index notice.
        // NOTE(review): presumably BE_USER can be absent in some CLI contexts - confirm.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Default: process 100 queue entries per run when nothing (or nothing positive) is configured.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // Clamp the number of parallel processes to 1..99, falling back to 1.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
269
270
    /**
271
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
272
     *
273
     * @param array $extensionSettings
274
     * @return void
275
     */
276 9
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the settings wholesale; the constructor defaults are NOT re-applied here.
        $this->extensionSettings = $extensionSettings;

        return;
    }
280
281
    /**
282
     * Check if the given page should be crawled
283
     *
284
     * @param array $pageRow
285
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
286
     */
287 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Checks run in a fixed order; the first one that matches decides the skip message.

        // 1) Hidden pages are skipped unless crawling them is explicitly enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // 2) Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // 3) Doktype lists registered for exclusion.
        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $registrationKey => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $registrationKey . '"';
            }
        }

        // 4) Veto hooks: a hook returns false to accept the page, anything else
        //    (true or a message string) vetoes it. The first veto wins.
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookKey => $hookFunction) {
            $params = [
                'pageRow' => $pageRow
            ];
            $veto = GeneralUtility::callUserFunction($hookFunction, $params, $this);
            if ($veto === false) {
                continue;
            }

            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
        }

        return false;
    }
340
341
    /**
342
     * Wrapper method for getUrlsForPageId()
343
     * It returns an array of configurations and no urls!
344
     *
345
     * @param array $pageRow Page record with at least dok-type and uid columns.
346
     * @param string $skipMessage
347
     * @return array
348
     * @see getUrlsForPageId()
349
     */
350 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $verdict = $this->checkIfPageShouldBeSkipped($pageRow);

        // Anything but false means "skip": report the reason and return no configurations.
        if ($verdict !== false) {
            $skipMessage = $verdict;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
364
365
    /**
366
     * This method is used to count if there are ANY unprocessed queue entries
367
     * of a given page_id and the configuration which matches a given hash.
368
     * If there are none, we can skip an inner detail check
369
     *
370
     * @param  int $uid
371
     * @param  string $configurationHash
372
     * @return boolean
373
     */
374 5
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Entries with exec_time = 0 are the ones counted as unprocessed here.
        $pendingConstraints = [
            $queryBuilder->expr()->eq('page_id', intval($uid)),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $pendingCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$pendingConstraints)
            ->execute()
            ->fetchColumn();

        // A falsy count (0, "0", false) means nothing is pending.
        return $pendingCount ? false : true;
    }
396
397
    /**
398
     * Creates a list of URLs from input array (and submits them to queue if asked for)
399
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
400
     *
401
     * @param    array        Information about URLs from pageRow to crawl.
402
     * @param    array        Page row
403
     * @param    integer        Unix time to schedule indexing to, typically time()
404
     * @param    integer        Number of requests per minute (creates the interleave between requests)
405
     * @param    boolean        If set, submits the URLs to queue
406
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
407
     * @param    array        Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
408
     * @param    array        Array which will be filled with URLS for download if flag is set.
409
     * @param    array        Array of processing instructions
410
     * @return    string        List of URLs (meant for display in backend module)
411
     *
412
     */
413 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // Builds the HTML list of all URLs of one configuration and, depending on
        // the flags, submits each new URL to the queue or collects it for download.
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlList = '';
        $subCfg = $vv['subCfg'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // The processing-instruction filter does not depend on the single URL,
        // so it is evaluated once instead of once per URL.
        if (!$this->drawURLs_PIfilter($subCfg['procInstrFilter'], $incomingProcInstructions)) {
            return $urlList;
        }

        // Seconds between two requests; guarded so a rate of 0 does not divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            // Calculate cHash:
            if ($subCfg['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $subCfg['userGroups'] . '|' . $subCfg['baseUrl'] . '|' . $subCfg['procInstrFilter'];
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, spread by the request rate and rounded down to a full minute.
            // Cast to int because date() expects an integer timestamp.
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                // Appended (not assigned) so that earlier entries of the list are kept.
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $formattedTime = date('d.m.y H:i', $schTime);
                $urlList .= '[' . $formattedTime . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . $formattedTime . '] ' . $urlQuery;

                $theUrl = ($subCfg['baseUrl'] ? $subCfg['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }

            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
486
487
    /**
488
     * Returns true if input processing instruction is among registered ones.
489
     *
490
     * @param string $piString PI to test
491
     * @param array $incomingProcInstructions Processing instructions
492
     * @return boolean
493
     */
494 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No incoming instructions means "no filtering": everything passes.
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        // Otherwise at least one incoming instruction must be part of the list in $piString.
        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
507
508 2
    public function getPageTSconfigForId($id)
    {
        // Returns the page TSconfig for the given page id. When a mount point is
        // set in $this->MP, the TSconfig of the page id after the "-" is used instead.
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration. The registration is read with "??"
        // (as for the other hooks of this class) to avoid an undefined-index notice.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            // pageTSConfig is handed over by reference so hooks can modify it in place.
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
530
531
    /**
532
     * This methods returns an array of configurations.
533
     * And no urls!
534
     *
535
     * @param integer $id Page ID
536
     * @return array
537
     */
538 2
    public function getUrlsForPageId($id)
    {
        // Collects crawler configurations for a page from two sources: page TSconfig
        // first, then tx_crawler_configuration records along the rootline. A name
        // already defined by an earlier source is never overwritten.
        $res = [];

        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);
        // Read the nested path defensively: any missing level simply yields null.
        $paramSets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;

        if (is_array($paramSets)) {
            foreach ($paramSets as $key => $values) {
                if (!is_array($values)) {
                    continue;
                }

                $key = str_replace('.', '', $key);
                // Sub configuration for a single configuration string:
                $subCfg = (array)($paramSets[$key . '.'] ?? []);
                $subCfg['key'] = $key;

                if ((string)($subCfg['procInstrFilter'] ?? '') !== '') {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }

                // Process the configuration only if it is not page-specific
                // or if the current page is among the listed pages.
                $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($paramSets[$key] ?? '');
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                $urlBase = '?id=' . $id . ($this->MP ? '&MP=' . $this->MP : '');
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$urlBase]);
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    $queryBuilder->expr()->eq('pid', (int)$page['uid'])
                )
                ->execute()
                ->fetchAll();

            foreach ($configurationRecordsForCurrentPage ?? [] as $configurationRecord) {
                // check access to the configuration record
                $accessGranted = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$accessGranted) {
                    continue;
                }

                // Process the configuration only if it is not page-specific
                // or if the current page is among the listed pages.
                $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    // Cast to string: the callee declares a non-nullable string parameter,
                    // so a NULL column value would otherwise raise a TypeError.
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        (string)$configurationRecord['base_url'],
                        (int)$configurationRecord['sys_domain_base_url'],
                        (bool)($configurationRecord['force_ssl'] > 0)
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Skip pages that the record explicitly excludes.
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                // NOTE(review): unlike the page TSconfig branch, no MP parameter is appended here - confirm this is intended.
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        // Let registered hooks post-process the result (handed over by reference).
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
674
675
    /**
676
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
677
     *
678
     * @param string $baseUrl
679
     * @param integer $sysDomainUid
680
     * @param bool $ssl
681
     * @return string
682
     */
683 4
    protected function getBaseUrlForConfigurationRecord(string $baseUrl, int $sysDomainUid, bool $ssl = false): string
    {
        // Without a domain record reference the given base URL is used unchanged.
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $uidMatches = $queryBuilder->expr()->eq('uid', $sysDomainUid);
        $domainName = $queryBuilder
            ->select('domainName')
            ->from('sys_domain')
            ->where($uidMatches)
            ->execute()
            ->fetchColumn();

        // No (or an empty) domain name found: fall back to the given base URL.
        if (empty($domainName)) {
            return $baseUrl;
        }

        $scheme = $ssl ? 'https' : 'http';

        return $scheme . '://' . $domainName;
    }
702
703
    /**
704
     * @param $rootid
705
     * @param $depth
706
     * @return array
707
     *
708
     * TODO: Write Functional Tests
709
     */
710
    public function getConfigurationsForBranch($rootid, $depth)
    {
        // Returns the names of all crawler configurations available for a branch:
        // paramSets from the page TSconfig of $rootid plus the names of
        // tx_crawler_configuration records stored on the rootline or inside the
        // page tree below $rootid (down to $depth levels).
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        // Read the nested path defensively: any missing level simply yields null.
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                // TypoScript sub-arrays carry a trailing dot in their key; strip that one dot.
                $configurationsForBranch[] = substr($key, -1) === '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Collect the page ids of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        // ... and of the subtree the backend user is allowed to see.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // An empty id list would produce an invalid "IN ()" clause, so stop here.
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
762
763
    /**
764
     * Get querybuilder for given table
765
     *
766
     * @param string $table
767
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
768
     */
769 9
    private function getQueryBuilder(string $table)
    {
        // Resolve the connection responsible for the table, then derive a builder from it.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
775
776
    /**
     * Decides whether a user, identified by its group list, may access an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict the item at all
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is sufficient
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
797
798
    /**
     * Parse GET vars of input Query into array with key=>value pairs
     *
     * The query is split on "&"; each part is split on its first "=" only, so a
     * value may itself contain "=". Names and values are rawurldecode()'d. Parts
     * with an empty name are skipped, and a part without "=" yields an empty
     * string as value.
     *
     * @param string $inputQuery Input query string
     * @return array Decoded parameter names as keys, decoded values as values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements: a part without "=" (e.g. "&flag") has no value,
            // which would otherwise leave the second list() target unassigned.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
820
821
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub-parameters read by this method: _PIDFIELD (field compared with the PID, default "pid"),
     *        _WHERE (additional where clause), _ADDTABLE (additional FROM part), _FIELD (field whose values are collected, default "uid")
     *         - Default: Literal value
     *
     * After each part, the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with the same keys, each holding an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        // Swap if first is larger than last:
                        if ($reg[1] > $reg[2]) {
                            $temp = $reg[2];
                            $reg[2] = $reg[1];
                            $reg[1] = $temp;
                        }

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (substr(trim($pV), 0, 7) === '_TABLE:') {
                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Pad so that a sub-part without ":" gets an empty value instead of an undefined offset
                            list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, '');
                            $subpartParams[$pKey] = $pVal;
                        }

                        $tableName = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                            $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Only "uid" or a field configured in TCA may be selected
                            if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                        $where
                                    );

                                // Only touch the FROM part when an additional table was actually configured;
                                // adding an empty part would replace the table set via from() above.
                                if (!empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    // NOTE(review): add() is called without $append, so this presumably replaces the FROM part - confirm.
                                    $queryBuilder->add('from', $addTable);
                                }

                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';

                                if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $queryBuilder->quoteIdentifier($transOrigPointerField),
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index the rows by the value of the selected field, so that the keys
                                // are the (unique) values to expand the parameter to.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParameterHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParameterHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid
                        ];
                        foreach ($expandParameterHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by parameter name:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
959
960
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at the extension setting "maxCompileUrls" (forced into the range 1..1000000000, default 10000).
     *
     * The first parameter is taken off $paramArray and each of its values is appended
     * to every URL in $urls; the method then recurses with the remaining parameters.
     * Empty values do not add anything to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (empty($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Shift first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        $maxCompileUrls = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? null,
            1,
            1000000000,
            10000
        );

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maxCompileUrls) {
                    // Limit reached: leave BOTH loops, otherwise every remaining
                    // URL of the outer loop would still add one more entry.
                    break 2;
                }
                $val = (string)$val;
                $newUrls[] = $url . ($val !== '' ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
994
995
    /************************************
996
     *
997
     * Crawler log
998
     *
999
     ************************************/
1000
1001
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is not restricted to the page ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // The conditions for a flush are collected in this AND-expression; it is kept
        // in sync with the restrictions added to the SELECT query builder.
        $flushConditions = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConditions->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConditions->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConditions->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConditions);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1055
1056
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is not restricted to the set ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // FIXME: Write Unit tests for Filters
        // The conditions for a flush are collected in this AND-expression; it is kept
        // in sync with the restrictions added to the SELECT query builder.
        $flushConditions = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConditions->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConditions->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConditions->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty where clause makes flushQueue() remove all entries
            $this->flushQueue($doFullFlush ? '' : (string)$flushConditions);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1109
1110
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected
     * entries are first looked up per set ID and posted to the event dispatcher;
     * afterwards the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may hand over an expression object; normalise it to its SQL string once
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Each statement gets its own query builder: from() appends to the builder's
            // state, so re-using one builder would accumulate FROM parts.
            // select() quotes its arguments as identifiers, hence the distinct set IDs
            // are determined via GROUP BY instead of a "DISTINCT set_id" pseudo column.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on this table in this class use "qid" as identifier;
                    // confirm that the "uid" column selected here exists.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1150
1151
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time is used
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to "now" when no (or a zero) schedule time was given
        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueEntry);
    }
1181
1182
    /************************************
1183
     *
1184
     * URL setting
1185
     *
1186
     ************************************/
1187
1188
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (if $this->registerQueueEntriesInternallyOnly is set) or inserts it into
     * tx_crawler_queue, unless an equal entry is found by the duplication check.
     * The events "urlAddedToQueue" / "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys userGroups, procInstrFilter, procInstrParams., rootTemplatePid and key are read
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'] ?? null;

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        $rows = [];
        if (!$skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($rows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1270
1271
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries of the same page with the same parameters hash that are not yet
     * executed (exec_time = 0) and not assigned to a process (process_id = 0) count as duplicates.
     *
     * @param int $tstamp Scheduled time of the entry to be added
     * @param array $fieldArray Queue record to be added; the keys page_id and parameters_hash are read
     *
     * @return array List of qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // This entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Only entries that are still waiting can be duplicates of the new one; entries
        // that were already executed or picked up by a process must not block re-queueing.
        $statement = $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', 0))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1332
1333
    /**
     * Provides the current system time as reported by time()
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1342
1343
    /************************************
1344
     *
1345
     * URL reading
1346
     *
1347
     ************************************/
1348
1349
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, stamps it with exec_time (which marks it as taken), runs it through
     * readUrl_exec() and finally stores the serialized result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bitmask: CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollSuccess
     *                 instruction reported success; 0 otherwise (also when no matching queue entry exists)
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without $force only entries that are scheduled but not yet executed are picked up
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process: return the documented integer instead of null
            return $ret;
        }

        // unserialize() yields false for broken data, so check the type before reading offsets
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // At the moment there is no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;

        // readUrl_exec() may hand back 'ERROR' or false; only an array result carries 'content'
        if (is_array($pollSuccess) && is_array($result) && isset($result['content'])) {
            // NOTE(review): for regular FE requests 'content' is the fetched response body, i.e. external
            // data is passed to unserialize() here - consider restricting allowed classes.
            $resultData = unserialize($result['content']);
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;

            if (is_array($procInstructions)) {
                foreach ($pollSuccess as $pollable) {
                    // Only check the success value if the instruction is running.
                    // It is important to name the pollSuccess key same as the procInstructions key.
                    if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1448
1449
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given fields as a new queue row (exec_time already set), executes the entry
     * right away and writes the serialized outcome back into the row's result_data.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|string|bool Whatever readUrl_exec() produced for the new entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The freshly created id is needed both for the execution and for the final update
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($queueTable, $field_array, ['qid' => $queueId]);

        return $result;
    }
1487
1488
    /**
     * Read URL for a queue record
     *
     * Either delegates to the callback object named in the record's parameters (key "_CALLBACKOBJ")
     * or performs a regular frontend request for the record's URL.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool 'ERROR' if the parameters could not be decoded, an array with key
     *                           "content" for callbacks, otherwise the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object (the key is absent for regular requests, hence the existence check):
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);
            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:

        // Prepare:
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1523
1524
    /**
     * Gets the content of a URL.
     *
     * Unless direct requests are enabled in the extension settings, the request is sent as a raw
     * HTTP/1.0 GET over a socket (optionally via the configured proxy). Redirects are followed
     * when the "follow30x" setting is enabled, up to $recursion hops.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout handed to fsockopen() (seconds)
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with the keys "request", "headers" and "content" (plus "parentRequest"
     *                       after a followed redirect), or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // Log the input string; $url itself is just "false" at this point
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        $scheme = $url['scheme'] ?? '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $proxyUrl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $rurl = is_array($proxyUrl) ? $proxyUrl : [];
            // Through a proxy the request line carries the absolute URL instead of just the path
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . (($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        $host = $rurl['host'] ?? '';
        $port = (int)($rurl['port'] ?? 0);

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $port > 0 ? $port : 443;
        } else {
            $port = $port > 0 ? $port : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? '');

            if ($newUrl) {
                // Follow the redirect exactly once, with the caller's timeout and one hop less to go
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($newRequestUrl)) {
                    $this->logger->debug(
                        sprintf('Error while opening "%s"', $newUrl),
                        ['crawlerId' => $crawlerId]
                    );
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1630
1631
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Lookup order: extension setting "frontendBasePath", then the absRefPrefix of the global
     * TSFE object, then (outside CLI only) the TYPO3_SITE_PATH environment value. Falls back to "/".
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (Environment::isCli()) {
            // In CLI mode there is no $_SERVER environment to derive the path from
            $basePath = '/';
        } else {
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegements>/':
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1661
1662
    /**
     * Runs the given command through shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1672
1673
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything after that is
     * collected unmodified as content. Anything that is not a resource yields two empty lists.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a line such as "0" is falsy but still valid data
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // the empty line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1704
1705
    /**
     * Appends a timestamped line to the file configured in the extension setting "logFileName".
     * Does nothing when no file is configured; a failed write is reported to the logger.
     *
     * In the future this setting "logFileName" should be removed in favor of using the TYPO3 Logging Framework
     *
     * @param string $message the message string to log
     */
    protected function log(string $message): void
    {
        $logFileName = $this->extensionSettings['logFileName'] ?? '';
        if (empty($logFileName)) {
            return;
        }

        // The warning is suppressed because the failure is handled explicitly right below
        $bytesWritten = @file_put_contents($logFileName, date('Ymd His') . ' ' . $message . PHP_EOL, FILE_APPEND);

        if ($bytesWritten === false) {
            $this->logger->info(
                sprintf('File "%s" could not be written, please check file permissions.', $logFileName)
            );
        }
    }
1718
1719
    /**
     * Builds HTTP request headers.
     *
     * @param array $url URL parts as returned by parse_url(); "host", "path", "query", "user" and
     *                   "pass" are read, missing parts are treated as empty (path defaults to "/")
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array Header lines, starting with the HTTP/1.0 GET request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits parts that are not present, so every part needs a default
        $path = (string)($url['path'] ?? '');
        if ($path === '') {
            // a request line without a path would be invalid
            $path = '/';
        }
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . ($url['pass'] ?? ''));
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1743
1744
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose first header line contains "301 Moved", "302 Found" or "302 Moved"
     * are treated as redirects. When a user is given, the credentials are injected into the
     * redirect target.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or false if there is nothing (valid) to follow
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers)) {
            return false;
        }

        $statusLine = (string)($headers[0] ?? '');
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Header names are case-insensitive and the value itself
        // may contain colons, so split at the first colon only.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        // Re-inject the credentials into the redirect target
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . ($target['path'] ?? '');
        if (($target['query'] ?? '') != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1786
1787
    /**************************
1788
     *
1789
     * tslib_fe hooks:
1790
     *
1791
     **************************/
1792
1793
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On a match the crawler state is stored in the
     * applicationData of $params['pObj']; otherwise script execution is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad the parts so a header without ":" yields an empty hash instead of an undefined variable
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // The hash comes from the request, so compare it in constant time
        $isAuthentic = is_array($queueRec) && hash_equals(
            md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']),
            (string)$hash
        );

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        // If a crawler record was found and hash was matching, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1830
1831
    /*****************************
1832
     *
1833
     * Compiling URLs to crawl - tools
1834
     *
1835
     *****************************/
1836
1837
    /**
     * Walks the page tree below $id and compiles the URL rows for every page found.
     *
     * The arguments are stored on the object first, because drawURLs_addRowsForPage() reads them
     * from there. Mount point pages additionally contribute the pages of their mounted tree.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The record is read from "pages", so ask for a query builder of that table
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                // NOTE(review): the builder is local and not reused afterwards - confirm this reset is needed
                $queryBuilder->getRestrictions()->reset();

                // Only continue with the mount point handling if the page record was actually found
                $mountPageRow = $mountpage[0] ?? null;
                if (is_array($mountPageRow)) {
                    // fetch mounted pages
                    $this->MP = $mountPageRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPageRow['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountPageRow['mount_pid_ol']) {
                        $data['row']['uid'] = $mountPageRow['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1939
1940
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts. A bare "<pid>"
     * excludes only that page, "<pid>+" (no depth given) uses depth 99, "<pid>+<depth>" also
     * excludes the pages found below it down to that depth. Results are cached per exclude string.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(), so an empty result counts as cached as well
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad the parts: a plain "<pid>" has no depth segment at all
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1991
1992
    /**
     * Builds the table rows shown for one page of the page tree.
     * Every crawler configuration that applies to the page yields one row listing its
     * GET parameter setup, the expanded parameter values, the resulting URLs and its options.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // With an active selection, drop every configuration that is not part of it
        if (!empty($this->incomingConfigurationSelection)) {
            foreach ($configurations as $selectionKey => $unusedConfiguration) {
                if (!in_array($selectionKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$selectionKey]);
                }
            }
        }

        $content = '';
        if (empty($configurations)) {
            $message = '';
            if (!empty($skipMessage)) {
                $message = ' (' . $skipMessage . ')';
            }

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // Traverse parameter combinations:
            $rowIndex = 0;
            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row carries it, spanning all rows of this page
                $titleClm = !$rowIndex
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>'
                    : '';
                $stripe = $rowIndex % 2 ? '-20' : '-10';

                if (in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                    $content .= '<tr class="bgColor' . $stripe . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramRows = '';
                    $valueCounts = [];
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode("\n", $gVal))) . '</td>
                            </tr>
                        ';
                        $valueCounts[] = $valueCount;
                    }
                    // Number of combinations = product of the value counts of all parameters
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramRows . '</table>'
                        . 'Comb: ' . implode('*', $valueCounts) . '=' . array_product($valueCounts);

                    // Options
                    $optionLabels = [
                        'userGroups' => 'User Groups: ',
                        'baseUrl' => 'Base Url: ',
                        'procInstrFilter' => 'ProcInstr: ',
                    ];
                    $optionValues = '';
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if ($confArray['subCfg'][$optionKey]) {
                            $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                        }
                    }

                    // One GET parameter per line
                    $parsedParameters = nl2br(htmlspecialchars(rawurldecode(trim(str_replace(
                        '&',
                        "\n" . '&',
                        GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])
                    )))));
                    $procInstrParams = DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']);

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . $stripe . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . $parsedParameters . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . $procInstrParams . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        }

        return $content;
    }
2109
2110
    /*****************************
2111
     *
2112
     * CLI functions
2113
     *
2114
     *****************************/
2115
2116
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered CLI hooks, purges executed queue entries older than the
     * "purgeQueueDays" extension setting (if set), selects up to $countInARun due
     * queue entries, reserves them for the current process id and crawls them one
     * by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries to pick up in this run
     * @param int $sleepTime Pause after every crawled entry; passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch has been worked off; passed to sleep()
     * @return int Bitmask: the OR-ed readUrl() results combined with the self::CLI_STATUS_* flags set here
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $quidList = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Reserve queue entries for this process.
        // The surrounding BEGIN/COMMIT/ROLLBACK statements are currently disabled.
        //TODO make sure we're not taking assigned queue-entries

        // Save the number of assigned queue entries to determine how many have been processed later
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $queryBuilderUpdate
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            // set() creates the named parameter itself; handing it an already created
            // placeholder would store the placeholder name instead of the process id.
            ->set('process_id', $processId)
            ->execute();

        // The process id is a string (see CLI_buildProcessId()), so it must be matched
        // as a string - casting it to int would address the wrong process record.
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        if ((int)$numberOfAffectedRows !== count($quidList)) {
            // Not every selected entry could be reserved (a ROLLBACK would belong here).
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // If the CLI has been disabled between the start and the current read url
            // we need to return from the function; the process is NOT marked as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2242
2243
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
2257
2258
    /**
     * Try to acquire a new process with the given id
     *
     * Counts the active, non-deleted process records whose ttl has not expired yet.
     * If that count is below the "processLimit" extension setting, a new process
     * record is inserted for $id. Independent of the outcome, some auto-cleanup is
     * performed afterwards: process records are removed through the process
     * repository and processes with an expired ttl (orphans) are released.
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean FALSE if no system process id is available or the process limit is reached, TRUE otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // A surrounding transaction (BEGIN) is currently disabled.
        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute()
            ->fetchAll();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into running ones and orphans (ttl expired)
        $runningCount = 0;
        $expiredProcessIds = [];
        foreach ($activeProcesses as $process) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . $processLimit . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);

        return $acquired;
    }
2323
2324
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements on one shared query builder: queue entries of
     * inactive processes are unassigned, inactive processes that still own
     * unprocessed queue entries get their system process id reset, the requested
     * processes are marked as non-active and finally their unprocessed queue
     * entries are unassigned.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the BEGIN/COMMIT statements are disabled
     * @return boolean  FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $qb = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($processIds)) {
            return false; //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: unassign all queue entries that belong to a process which is not active
        $entriesOfInactiveProcesses = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $qb
            ->update('tx_crawler_queue', 'q')
            ->where($entriesOfInactiveProcesses)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Step 2: reset the system process id of inactive processes that own unprocessed queue entries.
        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // For reference, the previous version updated tx_crawler_process with
        //   deleted = 1, system_process_id = 0
        // for rows matching
        //   active=0 AND deleted=0 AND NOT EXISTS (
        //       SELECT * FROM tx_crawler_queue
        //       WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
        //       AND tx_crawler_queue.exec_time = 0
        //   )
        $qb->resetQueryPart('set');

        $processesWithOpenEntries = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $qb
            ->update('tx_crawler_process')
            ->where(
                $qb->expr()->eq('active', 0),
                $processesWithOpenEntries
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement, so the
        // assignment of step 2 presumably is still part of it - confirm this is intended.
        $withoutOpenEntries = 'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )';
        $qb
            ->update('tx_crawler_process')
            ->where(
                $withoutOpenEntries,
                $qb->expr()->in('process_id', $qb->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY)),
                $qb->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // Step 4: unassign the unprocessed queue entries of the requested processes
        $qb->resetQueryPart('set');
        $qb
            ->update('tx_crawler_queue')
            ->where(
                $qb->expr()->eq('exec_time', 0),
                $qb->expr()->in('process_id', $qb->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2418
2419
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process record with the lowest ttl for the given process id and
     * reports whether its "active" flag is 1. A missing record counts as not active.
     *
     * @param  string  $pid  identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                // Process ids are strings (see CLI_buildProcessId()); bind the value as a
                // string instead of casting it to int, which would match the wrong records.
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter((string)$pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->setMaxResults(1) // only the first row is evaluated
            ->execute()
            ->fetch();

        return is_array($row) && intval($row['active']) === 1;
    }
2448
2449
    /**
     * Create a unique Id for the current process
     *
     * The id is generated on first use from the current microtime and kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2461
2462
    /**
     * Wrapper around PHP's microtime() function.
     *
     * @param bool $get_as_float handed through to microtime() unchanged
     *
     * @return mixed the value returned by microtime() for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2471
2472
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is enabled when the "processDebug" extension setting evaluates
     * to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2484
2485
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * PHP binary from the "phpPath" extension setting. The frontend base path, the
     * URL and the base64 encoded, serialized request headers are passed as
     * arguments. The command output is returned as content and the elapsed time
     * is logged.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array with the
     *                keys "request", "headers" (always an empty string) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every argument is escaped on its own before the command line is glued together
        $command = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ]);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content,
        ];
    }
2523
2524
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" extension setting (in days)
     *
     * NOTE(review): this description previously named 1.5 and 7 days - presumably
     * the default values of these settings; verify against the extension configuration.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2540
2541
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * The controller instance is stored in $GLOBALS['TSFE'] before any of its
     * initialization methods is called.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            null,
            $pageId,
            0
        );
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->getConfigArray();
        $frontendController->settingLanguage();
        $frontendController->settingLocale();
        $frontendController->newCObj();
    }
2564
2565
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing and therefore
     * do not influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2578
}
2579