Completed
Push — typo3v9 ( 7f7f07...70150c )
by Tomas Norre
06:02
created

CrawlerController::getPageTreeAndUrls()   B

Complexity

Conditions 7
Paths 16

Size

Total Lines 91

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
cc 7
nc 16
nop 8
dl 0
loc 91
ccs 0
cts 47
cp 0
crap 56
rs 7.263
c 0
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use Psr\Log\LoggerAwareInterface;
35
use Psr\Log\LoggerAwareTrait;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Core\Environment;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
43
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
46
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
47
use TYPO3\CMS\Core\Utility\DebugUtility;
48
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
49
use TYPO3\CMS\Core\Utility\GeneralUtility;
50
use TYPO3\CMS\Core\Utility\MathUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController implements LoggerAwareInterface
61
{
62
    use LoggerAwareTrait;
63
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
65
    const CLI_STATUS_REMAIN = 1; //queue not empty
66
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
67
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
68
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
69
70
    /**
71
     * @var integer
72
     */
73
    public $setID = 0;
74
75
    /**
76
     * @var string
77
     */
78
    public $processID = '';
79
80
    /**
81
     * @var array
82
     */
83
    public $duplicateTrack = [];
84
85
    /**
86
     * @var array
87
     */
88
    public $downloadUrls = [];
89
90
    /**
91
     * @var array
92
     */
93
    public $incomingProcInstructions = [];
94
95
    /**
96
     * @var array
97
     */
98
    public $incomingConfigurationSelection = [];
99
100
    /**
101
     * @var bool
102
     */
103
    public $registerQueueEntriesInternallyOnly = false;
104
105
    /**
106
     * @var array
107
     */
108
    public $queueEntries = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $urlList = [];
114
115
    /**
116
     * @var array
117
     */
118
    public $extensionSettings = [];
119
120
    /**
121
     * Mount Point
122
     *
123
     * @var boolean
124
     */
125
    public $MP = false;
126
127
    /**
128
     * @var string
129
     */
130
    protected $processFilename;
131
132
    /**
133
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
134
     *
135
     * @var string
136
     */
137
    protected $accessMode;
138
139
    /**
140
     * @var BackendUserAuthentication
141
     */
142
    private $backendUser;
143
144
    /**
145
     * @var integer
146
     */
147
    private $scheduledTime = 0;
148
149
    /**
150
     * @var integer
151
     */
152
    private $reqMinute = 0;
153
154
    /**
155
     * @var bool
156
     */
157
    private $submitCrawlUrls = false;
158
159
    /**
160
     * @var bool
161
     */
162
    private $downloadCrawlUrls = false;
163
164
    /**
165
     * @var QueueRepository
166
     */
167
    protected $queueRepository;
168
169
    /**
170
     * @var ProcessRepository
171
     */
172
    protected $processRepository;
173
174
    /**
175
     * @var string
176
     */
177
    protected $tableName = 'tx_crawler_queue';
178
179
    /**
180
     * Method to set the accessMode can be gui, cli or cli_im
181
     *
182
     * @return string
183
     */
184 1
    public function getAccessMode()
185
    {
186 1
        return $this->accessMode;
187
    }
188
189
    /**
     * Stores the access mode so that getAccessMode() can return it later.
     *
     * @param string $accessMode access mode to remember
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
196
197
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file to the configured process filename;
     * enabling removes that file again if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if (!$disabled) {
            // re-enable: drop the marker file, but only when it is really there
            if (is_file($this->processFilename)) {
                unlink($this->processFilename);
            }
            return;
        }

        GeneralUtility::writeFile($this->processFilename, '');
    }
213
214
    /**
     * Tells whether the "disabled" marker file exists.
     *
     * @return bool true if disabled, i.e. the configured process filename is an existing file
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
223
224
    /**
     * Overrides the location of the marker file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
233
234
    /**
     * Returns the currently configured marker file location.
     *
     * @return string file name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 31
    /**
     * Sets up the repositories, remembers the backend user and the marker file
     * location, and loads the extension configuration including its defaults.
     *
     * Defaults applied: "countInARun" becomes 100 when it is missing or not a
     * positive integer; "processLimit" is forced into the range 1..99 (default 1).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // guard against the global not being set, which would raise an undefined index notice
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults; the settings array may be empty, so never read the keys unguarded
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
269
270
    /**
     * Replaces the extension settings held by this controller
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as is; the defaults applied in the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
280
281
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and the first hit wins: hidden page (unless
     * "crawlHiddenPages" is enabled), doktype 3, 4 or >= 199, doktypes registered
     * in the "excludeDoktype" configuration, and finally the "pageVeto" hooks.
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $registrationKey => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $registrationKey . '"';
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookKey => $hookFunction) {
            $hookParameters = [
                'pageRow' => $pageRow
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one return a veto
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
        }

        return false;
    }
340
341
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); a skipped page
     * yields an empty array and the reason in $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
364
365
    /**
     * Tells whether the queue table holds NO unprocessed entry (exec_time = 0)
     * for the given page_id and configuration hash.
     * If there is none, callers can skip a more detailed per-entry check.
     *
     * @param  int $uid page id the queue entries belong to
     * @param  string $configurationHash
     * @return boolean true when no unprocessed entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expr = $queryBuilder->expr();

        $constraints = [
            $expr->eq('page_id', intval($uid)),
            $expr->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $expr->eq('exec_time', 0),
        ];

        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        return !$pendingEntries;
    }
396
397
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl (reads the keys 'URLs' and 'subCfg').
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); 0 or less disables the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string URL line meant for display in the backend module, or 'ERROR - no URL generated'
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Not every configuration source sets all sub configuration keys, so read them guarded.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';
        $baseUrl = $vv['subCfg']['baseUrl'] ?? '';
        $appendCacheHash = !empty($vv['subCfg']['cHash']);

        // Seconds between two requests. A rate of 0 would otherwise divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlList = '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($appendCacheHash) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $userGroups . '|' . $baseUrl . '|' . $procInstrFilter;
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is overwritten per URL, so only the last URL's line is returned.
            // Kept as is because callers may rely on it - confirm whether appending was intended.
            if (isset($duplicateTrack[$uKey])) {
                //if the url key is registered just display it and do not resubmit is
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($baseUrl ? $baseUrl : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
486
487
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     * Otherwise at least one incoming instruction has to be an item of the
     * comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
507
508 2
    /**
     * Fetches the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the second dash-separated
     * segment of that value instead of for $id.
     *
     * @param integer $id Page ID
     * @return array page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly changed by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration; the hook registration is optional, so read it guarded
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // handed over by reference so that hooks can modify the configuration in place
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
530
531
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. the "tx_crawler.crawlerCfg.paramSets." page TSconfig of the page,
     *  2. tx_crawler_configuration records stored on the pages of the rootline.
     * A record never replaces a configuration that is already present under the
     * same key. Afterwards the "processUrls" hooks receive the result by reference.
     *
     * @param integer $id Page ID
     * @return array configurations indexed by their key; each entry holds 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId($id)
    {
        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        $res = [];

        /**
         * Get configuration from tsConfig
         */
        // every level of the TSconfig tree is optional, so walk it guarded
        $crawlerCfg = is_array($pageTSconfig) ? ($pageTSconfig['tx_crawler.']['crawlerCfg.'] ?? null) : null;
        $paramSets = is_array($crawlerCfg) ? ($crawlerCfg['paramSets.'] ?? null) : null;

        if (is_array($paramSets)) {
            foreach ($paramSets as $key => $values) {
                if (!is_array($values)) {
                    continue;
                }
                $key = str_replace('.', '', $key);
                // Sub configuration for a single configuration string:
                $subCfg = (array)($paramSets[$key . '.'] ?? []);
                $subCfg['key'] = $key;

                $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
                if ($procInstrFilter !== '') {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
                }
                $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($paramSets[$key] ?? '');
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                if (!$this->MP) {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                } else {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
                }
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(
                    // the value goes into the SQL as is, so make sure it is an integer
                    $queryBuilder->expr()->eq('pid', (int)$page['uid'])
                )
                ->execute()
                ->fetchAll();

            foreach ($configurationRecordsForCurrentPage ?? [] as $configurationRecord) {
                // check access to the configuration record
                $mayAccessRecord = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$mayAccessRecord) {
                    continue;
                }

                $recordPidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $recordPidsOnly, true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if ($recordPidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        (int)$configurationRecord['sys_domain_base_url'],
                        (bool)($configurationRecord['force_ssl'] > 0)
                    ),
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                // NOTE(review): unlike the page TSconfig branch, $this->MP is not appended here - confirm this is intended
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
674
675
    /**
     * Resolves the base URL of a configuration record.
     *
     * If a sys_domain uid greater than zero is given and that record has a
     * non-empty domainName, the base URL is built from it (scheme chosen by $ssl).
     * In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl
     * @param integer $sysDomainUid
     * @param bool $ssl use "https" instead of "http" for a URL built from the domain record
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord(string $baseUrl, int $sysDomainUid, bool $ssl = false): string
    {
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $uidMatches = $queryBuilder->expr()->eq('uid', $sysDomainUid);

        $domainName = $queryBuilder
            ->select('domainName')
            ->from('sys_domain')
            ->where($uidMatches)
            ->execute()
            ->fetchColumn();

        if (empty($domainName)) {
            return $baseUrl;
        }

        $scheme = $ssl ? 'https' : 'http';

        return $scheme . '://' . $domainName;
    }
702
703
    /**
     * Collects the names of all crawler configurations relevant for a page branch.
     *
     * The names come from the paramSets in the page TSconfig of $rootid and from
     * tx_crawler_configuration records stored on the rootline of $rootid or on a
     * page of the tree below it (down to $depth, limited by the backend user's
     * page permissions).
     *
     * @param $rootid uid of the page the branch starts at
     * @param $depth number of tree levels handed to the page tree
     * @return array list of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $hasParamSets = is_array($pageTSconfig)
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.']);

        if ($hasParamSets) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $setName => $setConfiguration) {
                if (is_array($setConfiguration)) {
                    // strip the single trailing dot of the TSconfig key
                    $hasTrailingDot = substr($setName, -1) == '.';
                    $configurationsForBranch[] = $hasTrailingDot ? substr($setName, 0, -1) : $setName;
                }
            }
        }

        // pages above the root page ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        // ... and pages below it
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $tree->init('AND ' . $GLOBALS['BE_USER']->getPagePermsClause(1));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeEntry) {
            $pids[] = $treeEntry['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $restrictions = $queryBuilder->getRestrictions()->removeAll();
        $restrictions
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($configurationRow = $statement->fetch()) {
            $configurationsForBranch[] = $configurationRow['name'];
        }

        return $configurationsForBranch;
    }
762
763
    /**
     * Creates a fresh query builder on the connection responsible for the given table.
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
775
776
    /**
     * Tells whether a user may access an item that is restricted to certain groups
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access list is not restricted at all.
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is sufficient.
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
797
798
    /**
     * Parse GET vars of input Query into array with key=>value pairs
     *
     * Names and values are URL-decoded with rawurldecode(). Parts with an empty
     * parameter name (e.g. "=x" or an empty part between two "&") are skipped;
     * a parameter without "=" gets an empty string as value.
     *
     * @param string $inputQuery Input query string, e.g. "&L=0&type=98"
     * @return array Decoded parameter names as keys, decoded values as values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad to two elements so that a value-less parameter ("flag") does not
            // trigger an undefined offset and does not pass null to rawurldecode().
            list($p, $v) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($p)) {
                $paramKeyValues[rawurldecode($p)] = rawurldecode($v);
            }
        }

        return $paramKeyValues;
    }
820
821
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further sub parameters read by the code: _PIDFIELD (field compared with the page id, default "pid"),
     *           _FIELD (field whose values are collected, default "uid", must be a TCA column of the table),
     *           _WHERE (additional SQL condition) and _ADDTABLE (additional FROM part)
     *         - Default: Literal value
     * - After each part the hook $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     *   is called, so own place holders can be processed
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with the same keys, each holding the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not encapsulated in square brackets is literal and becomes the only value in the set.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the "key:value" sub parameters, separated by ";":
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad with null so a sub parameter without ":" does not raise an undefined offset
                        // and still counts as "not set" in the isset() checks below.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            // Only deleted records are filtered out, hidden/timed records are still looked up.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table);

                            if ($addTable !== '') {
                                // Only touch the FROM part when _ADDTABLE is configured; an empty value
                                // must not replace the table that was just set.
                                // TODO: Check if this works as intended!
                                // NOTE(review): add() without the append flag presumably replaces the FROM part - confirm.
                                $queryBuilder->add('from', $addTable);
                            }

                            // The expression builder quotes field names itself, so the plain name is passed.
                            // $where is raw SQL taken from the crawler configuration.
                            $queryBuilder->where(
                                $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                $where
                            );

                            $transOrigPointerField = isset($GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                // Records in the default language do not point to a translation parent.
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the value of the looked up field, so that the keys are the
                            // values to expand to (indexing by the field name yielded the name itself).
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ? $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
                    : null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
959
960
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited by the extension setting "maxCompileUrls" (forced into the range 1 - 1000000000, default 10000).
     *
     * The method handles the first parameter of $paramArray and calls itself for the remaining ones.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The compiled URLs; $urls is returned unchanged if $paramArray is empty or $urls is no array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (empty($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        $maxCompileUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Empty values do not add a GET parameter to the URL.
                $val = (string)$val;
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                if (count($newUrls) >= $maxCompileUrls) {
                    // Leave both loops: stopping the inner loop only would let every
                    // further base URL add entries beyond the configured maximum.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
994
995
    /************************************
996
     *
997
     * Crawler log
998
     *
999
     ************************************/
1000
1001
    /**
     * Return array of records from crawler queue for input page ID
     *
     * With $doFlush set nothing is selected: the matching entries (page ID plus filter) are
     * deleted through flushQueue() and an empty array is returned.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, then the matching entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE the whole queue is deleted, regardless of page ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled" descending, empty array when flushing
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expressionBuilder = $queryBuilder->expr();

        // The filter is needed for selecting as well as for flushing, so it is built only once.
        $filterConstraint = null;
        switch ($filter) {
            case 'pending':
                $filterConstraint = $expressionBuilder->eq('exec_time', 0);
                break;
            case 'finished':
                $filterConstraint = $expressionBuilder->gt('exec_time', 0);
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $deleteConstraints = $expressionBuilder->andX();
            if ($filterConstraint !== null) {
                $deleteConstraints->add($filterConstraint);
            }
            $deleteConstraints->add($expressionBuilder->eq('page_id', intval($id)));

            // flushQueue() expects the condition as SQL string.
            $this->flushQueue($doFullFlush ? '1=1' : (string)$deleteConstraints);

            return [];
        }

        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $expressionBuilder->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        if ($filterConstraint !== null) {
            $queryBuilder->andWhere($filterConstraint);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1055
1056
    /**
     * Return array of records from crawler queue for input set ID
     *
     * With $doFlush set nothing is selected: the matching entries (set ID plus filter) are
     * deleted through flushQueue() and an empty array is returned.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, then the matching entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE the whole queue is deleted, regardless of set ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled" descending, empty array when flushing
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expressionBuilder = $queryBuilder->expr();

        // FIXME: Write Unit tests for Filters
        // The filter is needed for selecting as well as for flushing, so it is built only once.
        $filterConstraint = null;
        switch ($filter) {
            case 'pending':
                $filterConstraint = $expressionBuilder->eq('exec_time', 0);
                break;
            case 'finished':
                $filterConstraint = $expressionBuilder->gt('exec_time', 0);
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $deleteConstraints = $expressionBuilder->andX();
            if ($filterConstraint !== null) {
                $deleteConstraints->add($filterConstraint);
            }
            $deleteConstraints->add($expressionBuilder->eq('set_id', intval($set_id)));

            // An empty condition makes flushQueue() remove all entries.
            $this->flushQueue($doFullFlush ? '' : (string)$deleteConstraints);

            return [];
        }

        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $expressionBuilder->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        if ($filterConstraint !== null) {
            $queryBuilder->andWhere($filterConstraint);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1109
1110
    /**
     * Removes queue entries
     *
     * If an observer is registered for the event "queueEntryFlush", the event is posted once per
     * affected set ID (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed (a string or an
     *                      expression that can be cast to string); an empty filter removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends to the FROM part,
            // so a reused builder would list the table once more for each further query.
            // selectLiteral() is used because select() would quote "DISTINCT set_id" as an identifier.
            $groups = $this->getQueryBuilder($this->tableName)
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on the queue table in this file use "qid" as
                    // identifier - confirm that the table really has a "uid" column.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    $eventDispatcher->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1150
1151
    /**
     * Adds a call back entry to the queue table (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function, anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate, 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Without a schedule time the entry is due immediately.
        $scheduledAt = (int)$schedule;
        if ($scheduledAt === 0) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1181
1182
    /************************************
1183
     *
1184
     * URL setting
1185
     *
1186
     ************************************/
1187
1188
    /**
     * Puts a URL into the crawler queue.
     *
     * If entries are registered internally only, the record is just collected in $this->queueEntries.
     * Otherwise it is inserted into tx_crawler_queue, unless the duplication check finds an equal entry.
     * The events "urlAddedToQueue" and "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new entry was written to the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile the queue record:
        $serializedParameters = serialize($parameters);
        $queueRecord = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database.
            $this->queueEntries[] = $queueRecord;

            return false;
        }

        // Look for an equal entry, unless the caller asked to skip the check.
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $queueRecord);
        }

        if (!empty($duplicateRows)) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicateRows, 'fieldArray' => $queueRecord]
            );

            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $queueRecord]
        );

        return true;
    }
1270
1271
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are still unprocessed (exec_time = 0) and not assigned to a process
     * (process_id = 0) count as duplicates; they are matched by page_id and parameters_hash.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Queue record of the new entry, "page_id" and "parameters_hash" are read
     *
     * @return array The qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now" (or a time in the past).
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time.
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        // Entries that have already been executed or picked up by a process are no duplicates
        // of a new request, so only untouched entries are matched.
        $statement = $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', 0))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1332
1333
    /**
     * Provides the current system time as returned by time().
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1342
1343
    /************************************
1344
     *
1345
     * URL reading
1346
     *
1347
     ************************************/
1348
1349
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, writes exec_time (and the handling process id,
     * if one is set) to lock it, executes the entry through readUrl_exec()
     * and finally stores the serialized result in the result_data column.
     * Signals are emitted before the record is locked and before the result
     * is written.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null 0 or self::CLI_STATUS_POLLABLE_PROCESSED; null if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without $force only entries that are assigned to a process and not executed yet are picked up
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        // NOTE(review): unserialize() on stored data; presumably only this extension writes the column - confirm.
        $parameters = unserialize($queueRec['parameters']);
        // unserialize() yields false for broken data, so check the type before reading an index
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() returns the string 'ERROR' or false instead of an array when nothing could be
        // requested; only an array carries a "content" entry that can be decoded.
        $resultData = false;
        if (is_array($result) && isset($result['content']) && is_string($result['content'])) {
            $resultData = unserialize($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1448
1449
    /**
     * Crawls a queue entry that has not been stored in the queue table yet.
     *
     * The entry is inserted with exec_time set to the current time, executed
     * through readUrl_exec(), and the serialized result is then written to
     * the result_data column of the new row. The post-process signal receives
     * the fields to be stored by reference before they are written.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The value readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Writing exec_time marks the record as taken ("locked")
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Storing the result also denotes the end of the processing of this entry
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1487
1488
    /**
     * Read URL for a queue record
     *
     * If the decoded parameters name a callback object ("_CALLBACKOBJ"), that
     * object's crawler_execute() is called; otherwise the URL stored in the
     * parameters is requested through requestUrl() and an "urlCrawled" event
     * is posted.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool Array with a "content" key for callbacks, the value of requestUrl()
     *                           for regular requests, or the string 'ERROR' if the parameters could not be decoded
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        // NOTE(review): unserialize() on stored data; presumably only this extension writes the column - confirm.
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // !empty() instead of a plain truth check: regular requests carry no "_CALLBACKOBJ" key at all
        if (!empty($parameters['_CALLBACKOBJ'])) {
            // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:

        // Prepare: fe_init() recomputes this hash from the queue record to authenticate the request
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1523
1524
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the request is either handed over
     * to sendDirectRequest() or sent through a socket opened with
     * fsockopen(), optionally via the proxy configured in
     * SYS/curlProxyServer. With "follow30x" enabled, a redirect reported by
     * getRequestUrlFrom302Header() is followed until $recursion is used up.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param int $timeout Timeout handed to fsockopen()
     * @param int $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with the keys "request", "headers" and "content" (plus "parentRequest"
     *                    after a followed redirect), the result of sendDirectRequest() for direct
     *                    requests, or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits components that are not part of the URL
        $scheme = $url['scheme'] ?? '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and put the absolute target URL into the request line
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . ((int)($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $configuredPort = (int)($rurl['port'] ?? 0);

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $configuredPort > 0 ? $configuredPort : 443;
        } else {
            $port = $configuredPort > 0 ? $configuredPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId, 'errno' => $errno, 'errstr' => $errstr]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fwrite($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->logger->info($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Request the redirect target exactly once, with the same timeout and one recursion level less
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (!is_array($newRequestUrl)) {
                $this->logger->debug(
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1630
1631
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the extension setting
     * "frontendBasePath", $GLOBALS['TSFE']->absRefPrefix, and - outside of
     * CLI mode - the TYPO3_SITE_PATH environment value. Without any of them
     * "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicit extension setting wins
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // Outside of CLI mode the path can be taken from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/'
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1661
1662
    /**
     * Runs a shell command through shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1672
1673
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines up to the first blank line are collected (trimmed) as headers,
     * all remaining lines are collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. fgets() is compared against false explicitly because a line
        // consisting of "0" is falsy and would otherwise end the loop early.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line: end of the header section
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1704
1705
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and header lines of an HTTP/1.0 GET request.
     * A cookie header is added when the query contains "ADMCMD_previewWS",
     * and a Basic Authorization header when the URL carries a user.
     *
     * @param array $url URL components as returned by parse_url(); absent components are treated as empty
     * @param string $crawlerId Value of the X-T3crawler header
     *
     * @return array Request line followed by the header lines
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() leaves out components that are not part of the URL
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1729
1730
    /**
     * Check if the submitted HTTP-Header contains a redirect location and build the new crawler-url
     *
     * A redirect is recognized when the first header line contains
     * "301 Moved", "302 Found" or "302 Moved". The Location header name is
     * matched case-insensitively. If a user is given, the credentials are
     * inserted into the redirect URL; a Location without scheme or host is
     * returned unchanged because credentials cannot be inserted into it.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if the headers do not describe a usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        $location = null;
        foreach ($headers as $headerLine) {
            // Split at the first colon only: the value (a URL) contains colons itself,
            // and lines such as the status line may contain none at all
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }
        if (empty($target['scheme']) || empty($target['host'])) {
            // Relative redirect: nothing to attach the credentials to
            return $location;
        }

        $newUrl = $target['scheme'] . '://' . $user . ':' . $pass . '@' . $target['host'];
        if (!empty($target['port'])) {
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= $target['path'] ?? '';
        if (isset($target['query']) && $target['query'] !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1772
1773
    /**************************
1774
     *
1775
     * tslib_fe hooks:
1776
     *
1777
     **************************/
1778
1779
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". The hash is recomputed from the
     * queue record and the encryption key; if the record is missing or the
     * hash differs, the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only requests carrying the crawler header are handled here
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Authenticate crawler request:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        // Pad the parts so that a header without ':' yields an empty hash instead of an undefined offset
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $expectedHash = null;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
        }

        // If a crawler record was found and hash was matching, set it up.
        // hash_equals() compares in constant time; the hash comes from a request header.
        if ($expectedHash !== null && hash_equals($expectedHash, (string)$hash)) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1816
1817
    /*****************************
1818
     *
1819
     * Compiling URLs to crawl - tools
1820
     *
1821
     *****************************/
1822
1823
    /**
     * Builds the page tree below $id and compiles the URL rows for every page in it.
     *
     * The arguments are stored on the object first, because
     * drawURLs_addRowsForPage() reads them from there. For mount point pages
     * the mounted tree is traversed as well; while doing so $this->MP holds
     * "<mount_pid>-<uid of the mount point page>" and false otherwise.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The query reads from "pages", so the query builder is requested for that table
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a page record there is no mount target to traverse
                if (is_array($mountpage)) {
                    // fetch mounted pages
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1925
1926
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts of the form "<pid>" or
     * "<pid>+<depth>". Every pid is added to the result; for a depth greater
     * than 0 the uids of the pages found by PageTreeView::getTree() below
     * that pid are added as well. A "+" without a depth means depth 99.
     * Results are cached per exclude string for the duration of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        if (empty($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: a part without "+" has no depth element
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }
                    // Work with an integer so that a non-numeric depth counts as 0
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1977
1978
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * The configurations come from getUrlsForPageRow() and are reduced to
     * $this->incomingConfigurationSelection if that selection is not empty.
     * For every remaining configuration urlListFromUrlArray() is called,
     * unless the page is excluded by the configuration's "exclude" setting.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (HTML, inserted as is)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // The sub configuration keys read below are optional
            $subCfg = (isset($confArray['subCfg']) && is_array($confArray['subCfg'])) ? $confArray['subCfg'] : [];
            $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');

            // Title column: only the first row carries it, spanning all rows of this page
            if (!$c) {
                $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            } else {
                $titleClm = '';
            }

            if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ((array)($confArray['paramExpanded'] ?? []) as $gVar => $gVal) {
                    $valueCount = count($gVal);
                    $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= $valueCount;
                    $calcAccu[] = $valueCount;
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options: configuration values are plain text, so they are escaped for the HTML output
                $optionValues = '';
                if (!empty($subCfg['userGroups'])) {
                    $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                }
                if (!empty($subCfg['baseUrl'])) {
                    $optionValues .= 'Base Url: ' . htmlspecialchars((string)$subCfg['baseUrl']) . '<br/>';
                }
                if (!empty($subCfg['procInstrFilter'])) {
                    $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                }

                // Compile row:
                $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', (array)($confArray['paramParsed'] ?? []))))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
            } else {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
2095
2096
    /*****************************
2097
     *
2098
     * CLI functions
2099
     *
2100
     *****************************/
2101
2102
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered CLI hooks, purges processed queue entries older than the
     * "purgeQueueDays" setting, reserves up to $countInARun due queue entries for the
     * current process id and then reads them one by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Pause after each entry, passed to usleep() (microseconds)
     * @param int $sleepAfterFinish Pause after the whole batch, passed to sleep() (seconds)
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        $quidList = array_column($rows, 'qid');

        // The process id is a short MD5 string (see CLI_buildProcessId()); it must be
        // handled as a string everywhere, never cast to an integer.
        $processId = $this->CLI_buildProcessId();

        //reserve queue entries for process

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
        //TODO make sure we're not taking assigned queue-entires

        //save the number of assigned queue entrys to determine who many have been processed later
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = (int)$queryBuilderUpdate
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            // set() turns the value into a named parameter itself; passing an already
            // created placeholder here would wrap it a second time.
            ->set('process_id', $processId)
            ->execute();

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => $numberOfAffectedRows],
                ['process_id' => $processId]
            );

        if ($numberOfAffectedRows !== count($quidList)) {
            //$this->queryBuilder->getConnection()->executeQuery('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $processId . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }
        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($processId)) {
                $this->CLI_debug("conflict / timeout (" . $processId . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $processId . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2228
2229
    /**
     * Runs the registered CLI hooks.
     *
     * Every class name listed in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] is instantiated
     * and its crawler_init() method is called with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2243
2244
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted rows of tx_crawler_process whose ttl lies in the past are
     * collected as orphans and released at the end; all other rows count against the
     * "processLimit" setting. A new row is inserted only while that limit is not reached.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if a process row was inserted, FALSE if the system process id
     *                 could not be determined or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        $processRows = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute()
            ->fetchAll();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and still running ones
        $runningProcesses = 0;
        $expiredProcessIds = [];
        foreach ($processRows as $processRow) {
            if ($processRow['ttl'] < $currentTime) {
                $expiredProcessIds[] = $processRow['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        // Housekeeping runs regardless of whether a new process could be acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
2309
2310
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements on one shared query builder: queue entries of inactive
     * processes are un-assigned, inactive processes that still own pending queue entries
     * get their system_process_id reset, the requested processes are marked as non-active
     * and finally their pending queue entries are un-assigned.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the transaction statements are disabled
     * @return boolean  FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($processIds)) {
            return false;   //nothing to release
        }

        // Disabled transaction handling (would only apply when not already within a lock):
        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entires and which are not active

        $queueEntriesOfInactiveProcesses = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $queryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where($queueEntriesOfInactiveProcesses)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder->resetQueryPart('set');

        $processesWithPendingEntries = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                $processesWithPendingEntries
            )
            ->set('system_process_id', 0)
            ->execute();
        // previous version for reference
        /*
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );*/

        // mark all requested processes as non-active
        // NOTE(review): the 'set' part is not reset after the previous statement, so this
        // update presumably also writes system_process_id = 0 - confirm whether intended.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $queryBuilder->resetQueryPart('set');

        // un-assign the still pending queue entries of the requested processes
        $queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        // Disabled transaction handling (would only apply when not already within a lock):
        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        return true;
    }
2404
2405
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The process id is the short MD5 string produced by CLI_buildProcessId(), so it is
     * bound as a string parameter; casting it to an integer would compare against the
     * wrong value.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter((string)$pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        // No matching row means the process is unknown and therefore not active
        return is_array($row) && (int)$row['active'] === 1;
    }
2434
2435
    /**
     * Returns the id of the current process.
     *
     * On the first call the id is generated as a short MD5 hash of the current
     * microtime and stored in $this->processID; later calls return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2447
2448
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Forwarded unchanged to microtime()
     *
     * @return mixed Whatever microtime() returns for the given argument
     */
    protected function microtime($get_as_float = false)
    {
        $timestamp = microtime($get_as_float);

        return $timestamp;
    }
2457
2458
    /**
     * Echoes a message followed by a newline and flushes the output.
     * Does nothing unless the "processDebug" extension setting is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2470
2471
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary and passes the frontend base path, the URL and the
     * base64-encoded, serialized request headers as escaped arguments.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array with
     *                the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Only the binary is escaped as a command; every argument is escaped on its own
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $cmd = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($cmd);
        // Log the URL together with the elapsed time in seconds
        $this->logger->info($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2509
2510
    /**
     * Cleans up entries that stayed for too long in the queue.
     *
     * Flushes processed entries older than the "cleanUpProcessedAge" setting and entries
     * scheduled longer ago than the "cleanUpScheduledAge" setting. Both settings are
     * multiplied by 86400, i.e. treated as days (earlier documentation names 1.5 and
     * 7 days - presumably the default values, verify against the extension configuration).
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2526
2527
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Stores a new TypoScriptFrontendController for the given page in $GLOBALS['TSFE']
     * and then runs its initialization methods in a fixed order.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $GLOBALS['TSFE'] = GeneralUtility::makeInstance(TypoScriptFrontendController::class, null, $pageId, 0);

        // The order of the initialization steps is significant and must not be changed
        $initializationSteps = [
            'initFEuser',
            'determineId',
            'getConfigArray',
            'settingLanguage',
            'settingLocale',
            'newCObj',
        ];
        foreach ($initializationSteps as $step) {
            $GLOBALS['TSFE']->{$step}();
        }
    }
2550
2551
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The keys "paramExpanded" and "URLs" are removed before hashing and therefore
     * do not influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevant = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevant));
    }
2564
}
2565