Completed
Push — typo3v9 ( 7f7f07...70150c )
by Tomas Norre
06:02
created

CrawlerController::checkIfPageShouldBeSkipped()   F

Complexity

Conditions 14
Paths 360

Size

Total Lines 53

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 18.5707

Importance

Changes 0
Metric Value
cc 14
nc 360
nop 1
dl 0
loc 53
ccs 20
cts 28
cp 0.7143
crap 18.5707
rs 3.4333
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into a new, well-named method.

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use Psr\Log\LoggerAwareInterface;
35
use Psr\Log\LoggerAwareTrait;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Core\Environment;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
43
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController implements LoggerAwareInterface
61
{
62
    use LoggerAwareTrait;
63
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
65
    const CLI_STATUS_REMAIN = 1; //queue not empty
66
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
67
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
68
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
69
70
    /**
71
     * @var integer
72
     */
73
    public $setID = 0;
74
75
    /**
76
     * @var string
77
     */
78
    public $processID = '';
79
80
    /**
81
     * @var array
82
     */
83
    public $duplicateTrack = [];
84
85
    /**
86
     * @var array
87
     */
88
    public $downloadUrls = [];
89
90
    /**
91
     * @var array
92
     */
93
    public $incomingProcInstructions = [];
94
95
    /**
96
     * @var array
97
     */
98
    public $incomingConfigurationSelection = [];
99
100
    /**
101
     * @var bool
102
     */
103
    public $registerQueueEntriesInternallyOnly = false;
104
105
    /**
106
     * @var array
107
     */
108
    public $queueEntries = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $urlList = [];
114
115
    /**
116
     * @var array
117
     */
118
    public $extensionSettings = [];
119
120
    /**
121
     * Mount Point
122
     *
123
     * @var boolean
124
     */
125
    public $MP = false;
126
127
    /**
128
     * @var string
129
     */
130
    protected $processFilename;
131
132
    /**
133
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
134
     *
135
     * @var string
136
     */
137
    protected $accessMode;
138
139
    /**
140
     * @var BackendUserAuthentication
141
     */
142
    private $backendUser;
143
144
    /**
145
     * @var integer
146
     */
147
    private $scheduledTime = 0;
148
149
    /**
150
     * @var integer
151
     */
152
    private $reqMinute = 0;
153
154
    /**
155
     * @var bool
156
     */
157
    private $submitCrawlUrls = false;
158
159
    /**
160
     * @var bool
161
     */
162
    private $downloadCrawlUrls = false;
163
164
    /**
165
     * @var QueueRepository
166
     */
167
    protected $queueRepository;
168
169
    /**
170
     * @var ProcessRepository
171
     */
172
    protected $processRepository;
173
174
    /**
175
     * @var string
176
     */
177
    protected $tableName = 'tx_crawler_queue';
178
179
    /**
180
     * Method to set the accessMode can be gui, cli or cli_im
181
     *
182
     * @return string
183
     */
184 1
    public function getAccessMode()
185
    {
186 1
        return $this->accessMode;
187
    }
188
189
    /**
     * Stores the access mode on this controller.
     *
     * @param string $accessMode access mode to remember (documented on the property as 'gui', 'cli' or 'cli_im')
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
196
197
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file at $this->processFilename; enabling
     * removes that file again if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
213
214
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true if the file at $this->processFilename exists (i.e. disabled)
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
223
224
    /**
     * Overrides the path of the marker file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath file name including its path
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
233
234
    /**
     * Returns the path of the marker file used by setDisabled()/getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 31
    /**
     * Sets up repositories, the backend user, the marker file path and the
     * extension settings (including defaults for countInARun and processLimit).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // Read defensively so a missing global does not raise an undefined-index notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Defaults: a missing or non-positive countInARun falls back to 100.
        $countInARun = $this->extensionSettings['countInARun'] ?? 0;
        if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is clamped to 1..99; a missing value yields the default of 1.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
269
270
    /**
     * Replaces the extension settings held by this controller.
     *
     * The given array is stored as-is; the defaults applied in the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
280
281
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page (unless crawlHiddenPages is enabled), built-in disallowed
     * doktypes (3, 4 and everything >= 199), doktypes excluded through
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     * and finally the 'pageVeto' hooks.
     *
     * @param array $pageRow Page record; 'doktype' is evaluated, 'hidden' is optional
     * @return false|string false if the page should be crawled, otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // 'hidden' and 'crawlHiddenPages' may be absent; treat a missing value as "not set".
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto !== false) {
                // the first veto wins; remaining hooks are not executed
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
340
341
    /**
     * Wrapper around getUrlsForPageId() that first asks
     * checkIfPageShouldBeSkipped() whether the page may be crawled.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $verdict = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($verdict !== false) {
            $skipMessage = $verdict;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
364
365
    /**
     * Tells whether the queue table holds NO unprocessed entry (exec_time = 0)
     * for the given page and configuration hash. Callers can use a true result
     * to skip a more detailed per-URL check.
     *
     * @param  int $uid page uid, matched against page_id
     * @param  string $configurationHash matched against configuration_hash
     * @return boolean true when the count query finds nothing, false otherwise
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $pendingCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', intval($uid)),
                $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one unprocessed entry exists.
        return !$pendingCount;
    }
396
397
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL of the configuration contributes one entry to the returned
     * string; URLs whose uniqueness key is already present in $duplicateTrack
     * are only displayed (dimmed) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl; reads 'URLs' and 'subCfg'
     * @param array $pageRow Page row; 'uid' is used
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests);
     *                           a value of 0 or less disables the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per url to secure we will not crawl duplicates
     * @param array $downloadUrls Passed by reference; filled with URLs for download if flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Sub configuration keys are optional; fall back to empty values instead of raising notices.
        $subCfg = is_array($vv['subCfg'] ?? null) ? $vv['subCfg'] : [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';
        $baseUrl = $subCfg['baseUrl'] ?? '';

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // The filter does not depend on the individual URL, so evaluate it once.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guard against a division by zero for a rate of 0.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlList = '';

        foreach ($vv['URLs'] as $urlQuery) {
            // Calculate cHash:
            if (!empty($subCfg['cHash'])) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $userGroups . '|' . $baseUrl . '|' . $procInstrFilter;
            $urlQuery = 'index.php' . $urlQuery;

            if (isset($duplicateTrack[$uKey])) {
                // the url key is already registered: just display it, do not resubmit it
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                // Scheduled time, spread by the request rate and rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);

                $scheduleLabel = '[' . date('d.m.y H:i', $schTime) . '] ';
                $urlList .= $scheduleLabel . htmlspecialchars($urlQuery);
                $this->urlList[] = $scheduleLabel . $urlQuery;

                $theUrl = ($baseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }

            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
486
487
    /**
     * Decides whether a processing-instruction list passes the incoming filter.
     *
     * An empty $incomingProcInstructions accepts everything. Otherwise the
     * result is true as soon as one incoming instruction is contained in the
     * comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
507
508 2
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the
     * first '-' in $this->MP instead of for $id.
     *
     * @param integer $id Page ID
     * @return array the page TSconfig, possibly modified by the 'getPageTSconfigForId' hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration; the key is optional, so read it defensively.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            // pageTSConfig is handed over by reference so that hooks can modify it in place.
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
530
531
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets), origin 'pagets'
     *  2. tx_crawler_configuration records found on the pages of the rootline,
     *     origin 'tx_crawler_configuration_<uid>'
     * A record never overwrites a configuration of the same name that was
     * already collected. Finally the 'processUrls' hooks may alter the result.
     *
     * @param integer $id Page ID
     * @return array configurations keyed by name; each entry holds
     *               'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId($id)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        // The crawler section is optional; a missing level simply yields null here.
        $paramSets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;

        if (is_array($paramSets)) {
            foreach ($paramSets as $key => $values) {
                if (!is_array($values)) {
                    continue;
                }
                $key = str_replace('.', '', $key);

                // Sub configuration for a single configuration string:
                $subCfg = (array)($paramSets[$key . '.'] ?? []);
                $subCfg['key'] = $key;

                $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
                if ($procInstrFilter !== '') {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
                }

                $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration only if it is not page-specific or if the specific page is the current page:
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams((string)($paramSets[$key] ?? ''));
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $urlBase = '?id=' . $id;
                if ($this->MP) {
                    $urlBase .= '&MP=' . $this->MP;
                }

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$urlBase]),
                ];
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    // the value is placed into the expression as-is, so force it to an integer
                    $queryBuilder->expr()->eq('pid', (int)$page['uid'])
                )
                ->execute()
                ->fetchAll();

            foreach ($configurationRecordsForCurrentPage ?? [] as $configurationRecord) {
                // check access to the configuration record
                $mayUseRecord = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$mayUseRecord) {
                    continue;
                }

                $pidsOnly = (string)$configurationRecord['pidsonly'];
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration only if it is not page-specific or if the specific page is the current page:
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        (int)$configurationRecord['sys_domain_base_url'],
                        $configurationRecord['force_ssl'] > 0
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int)$configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages listed in the record's exclude string get no configuration
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        // Hooks receive the result by reference and may change it in place.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
674
675
    /**
     * Resolves the base URL of a configuration record.
     *
     * If $sysDomainUid is positive and a sys_domain row with that uid has a
     * non-empty domainName, the base URL is built from that name (http or
     * https depending on $ssl). In every other case $baseUrl is returned as given.
     *
     * @param string $baseUrl
     * @param integer $sysDomainUid
     * @param bool $ssl
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord(string $baseUrl, int $sysDomainUid, bool $ssl = false): string
    {
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $domainName = $queryBuilder
            ->select('domainName')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute()
            ->fetchColumn();

        if (empty($domainName)) {
            return $baseUrl;
        }

        $scheme = $ssl ? 'https' : 'http';

        return $scheme . '://' . $domainName;
    }
702
703
    /**
     * Collects the names of all crawler configurations available for a page branch.
     *
     * Names come from the page TSconfig paramSets of $rootid and from the
     * tx_crawler_configuration records stored on the rootline of $rootid or on
     * any page of the tree below it (down to $depth levels).
     *
     * @param integer $rootid uid of the page the branch starts at
     * @param integer $depth depth handed to PageTreeView::getTree()
     * @return array list of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);

        // The crawler section is optional; a missing level simply yields null here.
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;

        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                // TypoScript sub-arrays carry a trailing dot that is not part of the name
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // Without any page id there is nothing to look up (and the IN() list would be empty).
        if ($pids === []) {
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
762
763
    /**
     * Creates a query builder on the connection responsible for the given table.
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
775
776
    /**
     * Decide whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without an access list is not restricted at all.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (!GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            // First matching group is sufficient.
            return true;
        }

        return false;
    }
797
798
    /**
     * Parse the GET variables of a query string into an array of key => value pairs.
     *
     * Names and values are raw-URL-decoded. Pairs with an empty parameter name
     * are skipped; a parameter given without "=" is returned with an empty
     * string as value.
     *
     * @param string $inputQuery Input query string, e.g. "id=1&L=2"
     * @return array Decoded parameters as name => value
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements: a parameter without "=" must not read a
            // missing offset, and still ends up with an empty value.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
820
821
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE, _ADDTABLE
     *         - Default: Literal value
     *
     * After each part the hook
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * is called (if registered) and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each, the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value in the set.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Work on integers so every value of the range has the same type.
                    $rangeStart = (int)$reg[1];
                    $rangeEnd = (int)$reg[2];

                    // Swap if first is larger than last:
                    if ($rangeStart > $rangeEnd) {
                        $temp = $rangeEnd;
                        $rangeEnd = $rangeStart;
                        $rangeStart = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $rangeStart; $a <= $rangeEnd; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse "key:value" sub parameters separated by ";":
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // A sub parameter without ":" gets a null value instead of reading a missing offset.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        // Only "uid" or a column configured in TCA may be selected.
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            // Only touch the FROM part when an additional table was actually configured.
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                // NOTE(review): add() is called without the append flag, so it presumably
                                // replaces the FROM part set above instead of extending it - verify.
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by the selected field's value so the keys are the (unique) values to expand to.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
959
960
    /**
     * Compile URLs from a parameter array (output of expandParameters()).
     * Every given URL is combined with every value of every parameter, so the
     * number of URLs is the product of the number of values per key.
     * Empty values do not add a GET parameter to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls Base URLs to which the parameters are appended
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        // Consume one parameter per round until none is left.
        while (!empty($paramArray) && is_array($urls)) {
            // Take the first parameter (name and value set) off the stack:
            reset($paramArray);
            $currentName = key($paramArray);
            $currentValues = array_shift($paramArray);

            $urlLimit = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

            $expandedUrls = [];
            foreach ($urls as $baseUrl) {
                foreach ($currentValues as $value) {
                    $querySuffix = '';
                    if (strcmp($value, '')) {
                        $querySuffix = '&' . rawurlencode($currentName) . '=' . rawurlencode($value);
                    }
                    $expandedUrls[] = $baseUrl . $querySuffix;

                    // Stop expanding this base URL once the limit is exceeded.
                    if (count($expandedUrls) > $urlLimit) {
                        break;
                    }
                }
            }

            $urls = $expandedUrls;
        }

        return $urls;
    }
994
995
    /************************************
996
     *
997
     * Crawler log
998
     *
999
     ************************************/
1000
1001
    /**
     * Return the crawler queue records of a page - or flush (delete) them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the queue is flushed without page or filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue rows ordered by "scheduled" descending; empty array when flushing
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Collects the same conditions, AND-combined, as WHERE clause for the flush (DELETE) case.
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', intval($id)));
            // flushQueue() expects the WHERE clause as string.
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1055
1056
    /**
     * Return the crawler queue records of a set - or flush (delete) them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the queue is flushed without set or filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue rows ordered by "scheduled" descending; empty array when flushing
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Collects the same conditions, AND-combined, as WHERE clause for the flush (DELETE) case.
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', intval($set_id)));
            // flushQueue() expects the WHERE clause as string; an empty string flushes everything.
            $this->flushQueue($doFullFlush ? '' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1109
1110
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is
     * posted once per affected set_id (with the matching entries) before the
     * entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may hand in an expression object; normalise it to its SQL string first.
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder so that the select/from/where
            // parts of one statement never leak into the next one.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                // GROUP BY yields the distinct set ids without putting SQL keywords into select().
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on this table select "qid" - confirm that a "uid" column exists.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1150
1151
    /**
     * Add a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php").
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no (non-zero) schedule time was given.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connectionPool
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $queueRecord);
    }
1181
1182
    /************************************
1183
     *
1184
     * URL setting
1185
     *
1186
     ************************************/
1187
1188
    /**
     * Setting a URL for crawling:
     * builds the queue record for the URL and either registers it internally only
     * (if registerQueueEntriesInternallyOnly is set) or inserts it into the queue table,
     * unless an equal entry is already queued.
     *
     * Keys read from $subCfg (all optional): "userGroups", "procInstrFilter",
     * "procInstrParams.", "rootTemplatePid", "key".
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, otherwise FALSE
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'] ?? null;

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $rows = [];
        if (!$skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($rows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1270
1271
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp.
     *
     * Only entries with the same page_id and parameters_hash that are still
     * unprocessed (no exec_time, no process_id) count as duplicates.
     *
     * @param int $tstamp Scheduled time of the entry to add
     * @param array $fieldArray Queue record to add; "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // If this entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Restrict to unprocessed entries, as the contract above demands.
        // NOTE(review): "NOT process_id" relies on the database casting the column to a
        // boolean; confirm the column type and the value used for "no process assigned".
        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1332
1333
    /**
     * Provide the current system time.
     *
     * @return int Current Unix timestamp as reported by time()
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1342
1343
    /************************************
1344
     *
1345
     * URL reading
1346
     *
1347
     ************************************/
1348
1349
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by writing exec_time, executes it through
     * readUrl_exec() and finally stores the serialized result in result_data.
     * The pre-/post-process signals are emitted around the execution.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or a value containing self::CLI_STATUS_POLLABLE_PROCESSED when a
     *                 registered "pollSuccess" instruction reported success. 0 is also
     *                 returned when no matching queue record was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of null.
            return $ret;
        }

        // unserialize() yields false for broken data, so make sure we really got an array
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back the string 'ERROR' or false instead of an
        // array; only try to decode the content when there is one.
        $resultData = false;
        if (is_array($result) && isset($result['content']) && is_string($result['content'])) {
            $resultData = unserialize($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions)
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1448
1449
    /**
     * Read URL for a queue entry that has not been inserted yet
     *
     * The entry is inserted into tx_crawler_queue with exec_time already set,
     * executed through readUrl_exec() and afterwards updated with the
     * serialized result (after the post-process signal had the chance to
     * change the fields to be written).
     *
     * @param array $field_array Queue field array,
     *
     * @return array|string|bool Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $connection->insert($queueTable, $field_array);

        // The freshly created queue id is needed by the execution as well as for the update below
        $queueId = $connection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1487
1488
    /**
     * Read URL for a queue record
     *
     * Depending on the unserialized "parameters" of the record this either
     * delegates to a callback object (key "_CALLBACKOBJ") or performs a regular
     * frontend request through requestUrl().
     *
     * @param array $queueRec Queue record
     * @return array|string|bool The string 'ERROR' if the parameters could not be decoded,
     *                           an array with key "content" for callbacks, otherwise the
     *                           return value of requestUrl() (array, or false on failure)
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object (empty() avoids an undefined index for regular FE requests):
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);
            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:

        // Prepare (queue id plus hash, verified on the receiving side):
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1523
1524
    /**
     * Gets the content of a URL.
     *
     * Either delegates to sendDirectRequest() (extension setting
     * "makeDirectRequests") or opens a socket, sends a HTTP/1.0 GET request and
     * reads the response. If "follow30x" is enabled, redirects are followed
     * recursively and the redirecting response is kept under "parentRequest".
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time (passed to fsockopen() as connection timeout)
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with keys "request", "headers", "content" (and optionally
     *                       "parentRequest"), or false on any failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // $url is false here, so the original string has to be logged
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits components that are not part of the URL
        $scheme = $url['scheme'] ?? '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and request the absolute URL from it
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . ($url['host'] ?? '') . (!empty($url['port']) ? ':' . $url['port'] : '') . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $defaultPort = 443;
        } else {
            $defaultPort = 80;
        }
        $port = !empty($rurl['port']) ? $rurl['port'] : $defaultPort;

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->logger->info($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Follow the redirect exactly once, with the same timeout and one recursion level less
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($newRequestUrl)) {
                $this->logger->debug(
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1630
1631
    /**
     * Determines the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources are checked in this order: extension setting "frontendBasePath",
     * $GLOBALS['TSFE']->absRefPrefix, TYPO3_SITE_PATH (only outside of CLI).
     * If none applies, "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1661
1662
    /**
     * Runs the given command through shell_exec() and hands back its output.
     *
     * The command is passed on unchanged; no escaping is applied here.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution (as returned by shell_exec())
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1672
1673
    /**
     * Reads HTTP response from the given stream.
     *
     * Everything up to the first blank line is treated as headers, the rest as
     * content. The stream is read in chunks of at most 2047 bytes, so a very
     * long line is spread over several array items.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server (trimmed).
     *                                  content <array> Content chunks as read from the stream.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a chunk like "0" is falsy but valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // blank line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1704
1705
    /**
     * Builds HTTP request headers.
     *
     * The first entry is the HTTP/1.0 request line, followed by the header
     * lines. Components missing from $url are treated as empty; a missing path
     * is requested as "/" so that the request line stays valid.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     *
     * @return array List of request lines (without line endings)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() only returns the components that are present in the URL
        $path = (string)($url['path'] ?? '');
        if ($path === '') {
            $path = '/';
        }
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');
        $pass = (string)($url['pass'] ?? '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }
1729
1730
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are considered. If a user is given, the credentials are
     * injected into the redirect target.
     *
     * @param array $headers HTTP Header (one line per item, status line first)
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to follow, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers)) {
            return false;
        }

        $statusLine = (string)($headers[0] ?? '');
        $isRedirect = stristr($statusLine, '301 Moved')
            || stristr($statusLine, '302 Found')
            || stristr($statusLine, '302 Moved');
        if (!$isRedirect) {
            return false;
        }

        // Look for the first Location header. Header names are case-insensitive and
        // the value may itself contain colons, hence the limit of two parts.
        $location = null;
        foreach ($headers as $hl) {
            $tmp = explode(':', (string)$hl, 2);
            if (count($tmp) < 2) {
                // e.g. the status line, which has no "name: value" form
                continue;
            }
            if (strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        if (!($tmp = parse_url($location))) {
            return false;
        }

        // Rebuild the target with credentials, keeping an explicit port if there is one
        $newUrl = ($tmp['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($tmp['host'] ?? '')
            . (isset($tmp['port']) ? ':' . $tmp['port'] : '')
            . ($tmp['path'] ?? '');
        if (($tmp['query'] ?? '') != '') {
            $newUrl .= '?' . $tmp['query'];
        }

        return $newUrl;
    }
1772
1773
    /**************************
1774
     *
1775
     * tslib_fe hooks:
1776
     *
1777
     **************************/
1778
1779
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected in the form "<qid>:<hash>". If it is present but
     * cannot be verified, the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // The header is untrusted input; pad so a value without ":" yields an
        // empty hash instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $isAuthenticated = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // Constant-time comparison, the hash is derived from the encryption key
            $isAuthenticated = hash_equals($expectedHash, (string)$hash);
        }

        if (!$isAuthenticated) {
            die('No crawler entry found!');
        }

        // If a crawler record was found and hash was matching, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1816
1817
    /*****************************
1818
     *
1819
     * Compiling URLs to crawl - tools
1820
     *
1821
     *****************************/
1822
1823
    /**
     * Collects the page tree below a page and compiles the URL rows for every page in it.
     *
     * The incoming options are stored on the object, the tree is built with the
     * page permissions of $GLOBALS['BE_USER'] and each page (including pages
     * reached through mount points) is passed to drawURLs_addRowsForPage().
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the options for the row rendering
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Reset the per-run collections
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permsClause);

        // Set root row (only if the page is accessible):
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPage)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rows = '';

        foreach ($pageTree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPages = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->getRestrictions()->reset();

                $mountPage = $mountPages[0];
                $mountPid = $mountPage['mount_pid'];

                // fetch mounted pages
                $this->MP = $mountPid . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountData) {
                    $mountTitle = $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountData['row'], $mountTitle);
                }

                if ($mountPage['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $data['row']['uid'] = $mountPid;
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($data['row'], $pageTitle);
        }

        return $rows;
    }
1925
1926
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of entries in the form "<pid>",
     * "<pid>+" or "<pid>+<depth>". A plain pid excludes only that page, a
     * trailing "+" without depth is treated as depth 99. Results are cached
     * per exclude string for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids (the given pids plus the pages found below them)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Entries without "+" only yield one part; pad to avoid an undefined offset
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, null);

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // the tree below a page is only fetched once per pid/depth combination
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1977
1978
    /**
     * Create the rows for display of the page tree
     * For each page one row per crawler configuration is rendered, showing the
     * GET variable configuration, the expanded parameters and the resulting URLs.
     * Pages without any (selected) configuration get a single "No entries" row.
     *
     * @param    array        Page row
     * @param    string        Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // remove configuration that does not match the current selection
        if (!empty($this->incomingConfigurationSelection)) {
            foreach ($configurations as $selectionKey => $unused) {
                if (!in_array($selectionKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$selectionKey]);
                }
            }
        }

        // Nothing configured for this page: render the placeholder row and leave
        if (empty($configurations)) {
            $message = '';
            if (!empty($skipMessage)) {
                $message = ' (' . $skipMessage . ')';
            }

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Traverse parameter combinations:
        $content = '';
        $rowIndex = 0;
        $configurationCount = count($configurations);

        foreach ($configurations as $confKey => $confArray) {
            $rowClass = 'bgColor' . ($rowIndex % 2 ? '-20' : '-10');

            // Title column (only rendered once, spanning all configuration rows):
            $titleClm = $rowIndex === 0
                ? '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>'
                : '';

            $rowIndex++;

            $excludedPages = $this->expandExcludeString($confArray['subCfg']['exclude']);
            if (in_array($pageRow['uid'], $excludedPages)) {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $expandedRows = '';
            $valueCounts = [];
            $combinations = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $expandedRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $combinations *= $valueCount;
                $valueCounts[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $expandedRows . '</table>'
                . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

            // Options
            $optionValues = '';
            $subCfg = $confArray['subCfg'];
            if ($subCfg['userGroups']) {
                $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
            }
            if ($subCfg['baseUrl']) {
                $optionValues .= 'Base Url: ' . $subCfg['baseUrl'] . '<br/>';
            }
            if ($subCfg['procInstrFilter']) {
                $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
            }

            // GET parameters, one per line
            $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
            $parsedParameters = trim(str_replace('&', chr(10) . '&', $parsedParameters));

            // Compile row:
            $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode($parsedParameters))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
        }

        return $content;
    }
2095
2096
    /*****************************
2097
     *
2098
     * CLI functions
2099
     *
2100
     *****************************/
2101
2102
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps, in order: run the registered CLI hooks, optionally purge old processed
     * queue entries, reserve up to $countInARun due queue entries for the current
     * process and crawl them one after another via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Pause after each processed entry (passed to usleep(), i.e. microseconds)
     * @param int $sleepAfterFinish Pause after the whole batch (passed to sleep(), i.e. seconds)
     * @return int Bitmask combined from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $quidList = [];
        foreach ($rows as $r) {
            $quidList[] = $r['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Reserve the queue entries for this process.
        //TODO make sure we're not taking assigned queue-entires (no transaction is used here)

        // Save the number of assigned queue entries to determine how many have been processed later
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $queryBuilderUpdate
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $queryBuilderUpdate->createNamedParameter($processId, \PDO::PARAM_STR))
            ->execute();

        // The process id is the string built by CLI_buildProcessId() (a short MD5 hash), so it has
        // to be used as a string here; casting it to int would not address the row of this process.
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        // Fewer updated rows than selected rows means the entries could not all be reserved
        if ((int)$numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2228
2229
    /**
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
2243
2244
    /**
     * Tries to register a new crawler process under the given id.
     *
     * Active, non-deleted process records whose ttl lies in the past are treated as
     * orphans and released at the end; the remaining ones count against the
     * configured "processLimit". A new process record is only inserted while that
     * limit is not reached.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if the process record was inserted, false if the system process id
     *                 could not be determined or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // Note: the statements below are not wrapped in a transaction.
        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired (orphaned) ones and those still running
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < intval($this->extensionSettings['processLimit']);
        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        // Housekeeping is done regardless of whether a new process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
2309
2310
    /**
     * Releases one or more processes together with the queue entries reserved for them.
     *
     * Four UPDATE statements are issued through one shared query builder, in this order:
     *  1. queue entries that belong to inactive processes are detached from their process
     *  2. inactive processes that still own unprocessed queue entries get system_process_id = 0
     *  3. the requested, non-deleted processes without unprocessed queue entries are marked inactive
     *  4. unprocessed queue entries of the requested processes are detached from their process
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     *                                (currently not evaluated: no transaction is opened either way)
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($releaseIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. Free the queue entries of all processes that are already inactive
        $queryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // The builder is reused, so the SET part of the previous statement has to be dropped first
        $queryBuilder->resetQueryPart('set');

        // 2. FIXME: Not entirely sure that this is equivalent to the previous version.
        // The pre-Doctrine implementation updated tx_crawler_process rows with
        // "active=0 AND deleted=0 AND NOT EXISTS (<queue entry of that process with exec_time = 0>)"
        // and set deleted = 1 together with system_process_id = 0.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. Mark all requested processes as non-active
        // NOTE(review): the SET part is not reset before this statement, so it presumably also
        // writes system_process_id = 0 in addition to active = 0 - confirm this is intended.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $queryBuilder->resetQueryPart('set');

        // 4. Free the not yet processed queue entries of the requested processes
        $queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2404
2405
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process record (ordered by ttl) and evaluates the "active" flag
     * of the first row found.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false as well if no record exists for the given id
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Process ids are strings (CLI_buildProcessId() creates a short MD5 hash), therefore the id is
        // bound as a string parameter; converting it to an integer would compare against a meaningless number.
        $statement = $queryBuilder
            ->from('tx_crawler_process')
            ->select('active')
            ->where(
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter((string)$pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->execute();

        $row = $statement->fetch();
        if (!$row) {
            return false;
        }

        return (int)$row['active'] === 1;
    }
2434
2435
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use as a short MD5 hash of the current
     * microtime and kept in $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2447
2448
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Forwarded unchanged to microtime()
     *
     * @return mixed Whatever microtime() returns for the given argument
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2457
2458
    /**
     * Echoes a message followed by a line break and flushes the output,
     * but only if the extension setting "processDebug" is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2470
2471
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * The request is executed through a shell command that runs the crawler's
     * cli/bootstrap.php with the configured PHP binary; the duration of the call
     * is written to the logger.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array with the keys
     *                "request", "headers" (always an empty string) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every argument is escaped individually before the command line is assembled
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $responseBody = $this->executeShellCommand($command);
        $this->logger->info($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $responseBody
        ];
    }
2509
2510
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries that are over 1.5 days in age
     * - scheduled entries that are over 7 days old
     *
     * The actual ages are read (in days, fractions allowed) from the extension settings
     * "cleanUpProcessedAge" and "cleanUpScheduledAge"; the resulting condition is passed
     * to flushQueue().
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * 86400; // 24*60*60 Seconds in 24 hours
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $now = time();

        // Fractional day settings make the thresholds floats. Concatenating a float into SQL is
        // locale dependent before PHP 8 (decimal comma), so integer literals are enforced here.
        // Assuming exec_time and scheduled are integer timestamp columns (TODO confirm), rounding up
        // for "<" and rounding down for "<=" selects exactly the same rows as the float thresholds.
        $processedBefore = (int)ceil($now - $processedAgeInSeconds);
        $scheduledUntil = (int)floor($now - $scheduledAgeInSeconds);

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil;
        $this->flushQueue($condition);
    }
2526
2527
    /**
     * Creates a TypoScriptFrontendController for the given page, stores it in
     * $GLOBALS['TSFE'] and runs its initialisation steps so that TypoScript and
     * TypoLink functions can be used.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            null,
            $pageId,
            0
        );

        // Register the instance globally before initialising it, exactly as the steps below expect
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->getConfigArray();
        $frontendController->settingLanguage();
        $frontendController->settingLocale();
        $frontendController->newCObj();
    }
2550
2551
    /**
     * Builds an md5 hash from the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2564
}
2565