Completed
Push — typo3v9 ( f7f442...818151 )
by Tomas Norre
06:06
created

CrawlerController::getConfigurationsForBranch()   B

Complexity

Conditions 7
Paths 32

Size

Total Lines 44

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
cc 7
nc 32
nop 2
dl 0
loc 44
ccs 0
cts 30
cp 0
crap 56
rs 8.2826
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
30
use AOE\Crawler\Domain\Repository\ProcessRepository;
31
use AOE\Crawler\Domain\Repository\QueueRepository;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use Psr\Log\LoggerAwareInterface;
36
use Psr\Log\LoggerAwareTrait;
37
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
38
use TYPO3\CMS\Backend\Utility\BackendUtility;
39
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
40
use TYPO3\CMS\Core\Core\Environment;
41
use TYPO3\CMS\Core\Database\Connection;
42
use TYPO3\CMS\Core\Database\ConnectionPool;
43
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
44
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
46
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
47
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
48
use TYPO3\CMS\Core\Utility\DebugUtility;
49
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
50
use TYPO3\CMS\Core\Utility\GeneralUtility;
51
use TYPO3\CMS\Core\Utility\MathUtility;
52
use TYPO3\CMS\Extbase\Object\ObjectManager;
53
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
54
use TYPO3\CMS\Frontend\Page\PageRepository;
55
56
/**
57
 * Class CrawlerController
58
 *
59
 * @package AOE\Crawler\Controller
60
 */
61
class CrawlerController implements LoggerAwareInterface
62
{
63
    use LoggerAwareTrait;
64
65
    // Status values reported by CLI runs.
    // NOTE(review): the values 1, 2, 4, 8 look like bit flags — confirm they are combined with bitwise OR.
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
66
    const CLI_STATUS_REMAIN = 1; // queue not empty
67
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
68
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
69
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
70
71
    /**
72
     * @var integer
73
     */
74
    public $setID = 0;
75
76
    /**
77
     * @var string
78
     */
79
    public $processID = '';
80
81
    /**
82
     * @var array
83
     */
84
    public $duplicateTrack = [];
85
86
    /**
87
     * @var array
88
     */
89
    public $downloadUrls = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $incomingProcInstructions = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingConfigurationSelection = [];
100
101
    /**
102
     * @var bool
103
     */
104
    public $registerQueueEntriesInternallyOnly = false;
105
106
    /**
107
     * @var array
108
     */
109
    public $queueEntries = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $urlList = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $extensionSettings = [];
120
121
    /**
     * Mount Point
     *
     * Defaults to false (no mount point). When set, getPageTSconfigForId() splits the value
     * on '-' and getUrlsForPageId() appends it to the URL as the "MP" query parameter, so a
     * non-false value is presumably a string containing '-' — TODO confirm the exact format.
     *
     * @var bool|string
     */
126
    public $MP = false;
127
128
    /**
129
     * @var string
130
     */
131
    protected $processFilename;
132
133
    /**
134
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
135
     *
136
     * @var string
137
     */
138
    protected $accessMode;
139
140
    /**
141
     * @var BackendUserAuthentication
142
     */
143
    private $backendUser;
144
145
    /**
146
     * @var integer
147
     */
148
    private $scheduledTime = 0;
149
150
    /**
151
     * @var integer
152
     */
153
    private $reqMinute = 0;
154
155
    /**
156
     * @var bool
157
     */
158
    private $submitCrawlUrls = false;
159
160
    /**
161
     * @var bool
162
     */
163
    private $downloadCrawlUrls = false;
164
165
    /**
166
     * @var QueueRepository
167
     */
168
    protected $queueRepository;
169
170
    /**
171
     * @var ProcessRepository
172
     */
173
    protected $processRepository;
174
175
    /**
176
     * @var ConfigurationRepository
177
     */
178
    protected $configurationRepository;
179
180
    /**
181
     * @var string
182
     */
183
    protected $tableName = 'tx_crawler_queue';
184
185
186
    /**
187
     * @var int
188
     */
189
    protected $maximumUrlsToCompile = 10000;
190
191
    /**
     * Returns the current access mode.
     *
     * @return string the access mode; the property documentation lists 'gui', 'cli' and 'cli_im'
     */
196 1
    public function getAccessMode()
197
    {
198 1
        return $this->accessMode;
199
    }
200
201
    /**
     * Sets the access mode. The value is stored as given, without validation.
     *
     * @param string $accessMode the property documentation lists 'gui', 'cli' and 'cli_im'
     */
204 1
    public function setAccessMode($accessMode)
205
    {
206 1
        $this->accessMode = $accessMode;
207 1
    }
208
209
    /**
     * Switches the "disabled" marker on or off to prevent processes from being processed.
     *
     * The marker is the process file: disabling writes an empty file to that path,
     * enabling removes the file if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
225
226
    /**
     * Get disable status.
     *
     * The crawler counts as disabled while the process file exists (see setDisabled()).
     *
     * @return bool true if disabled
     */
231 3
    public function getDisabled()
232
    {
233 3
        return is_file($this->processFilename);
234
    }
235
236
    /**
     * Sets the path of the process file whose existence marks the crawler as disabled
     * (see setDisabled() and getDisabled()).
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
241 4
    public function setProcessFilename($filenameWithPath)
242
    {
243 4
        $this->processFilename = $filenameWithPath;
244 4
    }
245
246
    /**
     * Returns the path of the process file used as the "disabled" marker.
     *
     * @return string
     */
249 1
    public function getProcessFilename()
250
    {
251 1
        return $this->processFilename;
252
    }
253
254
    /************************************
255
     *
256
     * Getting URLs based on Page TSconfig
257
     *
258
     ************************************/
259
260 31
    /**
     * Wires up the repositories, the backend user and the process file, then loads the
     * extension configuration and normalises its numeric limits.
     *
     * Configuration keys that are missing fall back to their defaults without raising
     * "undefined index" notices:
     * - countInARun: 100 when missing or not a positive integer
     * - processLimit: forced into 1..99, default 1
     * - maxCompileUrls: forced into 1..1000000000, default 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);

        // The global may be absent (e.g. no backend user initialised); keep null then instead of a notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
283
284
    /**
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array replaces the current settings as given; the defaults and range limits the
     * constructor applies to 'countInARun', 'processLimit' and 'maxCompileUrls' are not
     * re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
290 9
    public function setExtensionSettings(array $extensionSettings)
291
    {
292 9
        $this->extensionSettings = $extensionSettings;
293 9
    }
294
295
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first one that matches wins:
     * 1. the page is hidden and the 'crawlHiddenPages' setting is off,
     * 2. the doktype is 3, 4 or >= 199,
     * 3. the doktype is listed in one of the 'excludeDoktype' registrations,
     * 4. one of the 'pageVeto' hooks returns anything other than false.
     *
     * @param array $pageRow page record; 'hidden' and 'doktype' are read
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $registrant => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $registrant . '"';
            }
        }

        // veto hooks: each one returns false if the page is fine, true or a message if it must not be crawled
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookName => $hookFunction) {
            $hookParameters = [
                'pageRow' => $pageRow
            ];
            $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            if ($veto === false) {
                continue;
            }

            // the first veto ends the check, later hooks are not asked any more
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
        }

        return false;
    }
354
355
    /**
     * Wrapper around getUrlsForPageId() that first asks checkIfPageShouldBeSkipped().
     * The result is an array of configurations, not of URLs.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage receives the skip reason, or '' when the page is not skipped
     * @return array the configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
378
379
    /**
     * Tells whether the queue holds no unprocessed entry (exec_time = 0) for the given
     * page and configuration hash. When there is none, callers can skip the more
     * detailed per-entry check.
     *
     * @param  int $uid page id
     * @param  string $configurationHash
     * @return boolean true if no matching unprocessed entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', (int)$uid),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
410
411
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL of the configuration is appended to the returned HTML; URLs whose key is
     * already present in $duplicateTrack are shown dimmed and are not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl ('URLs' and 'subCfg' are read)
     * @param array $pageRow Page row ('uid' is read)
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) fills $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL so duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = (array)($vv['subCfg'] ?? []);
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';

        // The filter does not depend on the individual URL, so it is evaluated once.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Guard against a division by zero when no request rate is given.
        $secondsBetweenRequests = $reqMinute > 0 ? 60 / $reqMinute : 0;

        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = empty($subCfg['cHash'])
            ? null
            : GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);

        $urlList = '';
        foreach ($vv['URLs'] as $urlQuery) {
            // Calculate cHash:
            if ($cacheHash !== null) {
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . ($subCfg['userGroups'] ?? '') . '|' . ($subCfg['baseUrl'] ?? '') . '|' . $procInstrFilter;
            $urlQuery = 'index.php' . $urlQuery;

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsBetweenRequests);
            $schTime = (int)(floor($schTime / 60) * 60);

            if (isset($duplicateTrack[$uKey])) {
                // the url key is registered already: just display it and do not resubmit it
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList .= '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = (!empty($subCfg['baseUrl']) ? $subCfg['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue receives $scheduledTime, not the interleaved $schTime shown in the list — confirm this is intended.
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
500
501
    /**
     * Checks a processing instruction list against the incoming processing instructions.
     *
     * With no incoming instructions everything passes. Otherwise the result is true as soon
     * as one incoming instruction is found in the comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // nothing requested means nothing to filter on
        $accepted = count($incomingProcInstructions) === 0;

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
521
522 2
    /**
     * Loads the Page TSconfig for a page and lets the 'getPageTSconfigForId' hooks alter it.
     *
     * Without a mount point ($this->MP is false) the TSconfig of $id is read. With a mount
     * point, $this->MP is split on '-' and the second segment is used as the page to read
     * from; if that segment is missing, $id is used instead of an undefined value.
     *
     * @param int $id Page ID
     * @return array the Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $configurationPageId = $id;
        if ($this->MP) {
            $mountPointParts = explode('-', (string)$this->MP);
            $configurationPageId = $mountPointParts[1] ?? $id;
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($configurationPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // handed over by reference so that hooks can modify the configuration in place
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
543
544
    /**
     * Collects the crawler configurations that apply to a page.
     * The result holds configurations keyed by name, and no flat list of URLs!
     *
     * Sources, in order of precedence:
     * 1. the 'tx_crawler.crawlerCfg.paramSets.' section of the page's Page TSconfig,
     * 2. tx_crawler_configuration records found along the rootline. A record never
     *    overwrites a paramSet of the same name, and it is only used when it has no
     *    'begroups' restriction, the backend user is an admin or shares one of the groups.
     * Afterwards the 'processUrls' hooks receive the result by reference and may change it.
     *
     * @param integer $pageId Page ID
     * @return array one entry per configuration key with 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list only when one is configured; a missing key stays missing.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly === '' || GeneralUtility::inList($pidOnlyList, $pageId)) {
                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string)($crawlerCfg[$key] ?? ''));
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                if (!$this->MP) {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                } else {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
                }
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // don't overwrite previously defined paramSets
            $key = $configurationRecord['name'];
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                    (string)$configurationRecord['base_url'],
                    (int)$configurationRecord['sys_domain_base_url'],
                    (bool)($configurationRecord['force_ssl'] > 0)
                ),
                'cHash' => $configurationRecord['chash'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'rootTemplatePid' => (int)$configurationRecord['root_template_pid'],
                'key' => $key
            ];

            // add trailing slash if not present
            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                $subCfg['baseUrl'] .= '/';
            }

            // Loose comparison on purpose: page ids may arrive as int or numeric string.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            // NOTE(review): unlike the Page TSconfig branch above, $this->MP is not appended here — confirm this is intended.
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
657
658
    /**
     * Resolves the base URL of a configuration record.
     *
     * When a sys_domain uid is given and that record has a domain name, the base URL is
     * built from the domain name with http or https as scheme; in every other case the
     * given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl fallback base URL
     * @param integer $sysDomainUid uid of the sys_domain record, values <= 0 mean "none"
     * @param bool $ssl use https instead of http for the domain based URL
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord(string $baseUrl, int $sysDomainUid, bool $ssl = false): string
    {
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $statement = $queryBuilder
            ->select('domainName')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $sysDomainUid)
            )
            ->execute();
        $domainName = $statement->fetchColumn();

        if (empty($domainName)) {
            return $baseUrl;
        }

        $scheme = $ssl ? 'https' : 'http';

        return $scheme . '://' . $domainName;
    }
685
686
    /**
     * Finds the names of all configurations available for a page branch.
     *
     * The names come from two places: the paramSets in the Page TSconfig of the root page,
     * and the tx_crawler_configuration records (not deleted) stored on any page of the
     * rootline of $rootid or of the page tree below it down to $depth levels.
     *
     * @param int $rootid page id the branch starts at
     * @param $depth number of tree levels passed on to the page tree
     * @return array list of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        // paramSet names are the array-valued keys, without their trailing dot
        $configurationNames = [];
        foreach ($paramSets as $setName => $definition) {
            if (is_array($definition)) {
                $configurationNames[] = substr($setName, -1) == '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // pages of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and pages of the branch the backend user may see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $pageTree->init('AND ' . $GLOBALS['BE_USER']->getPagePermsClause(1));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $restrictions = $queryBuilder->getRestrictions()->removeAll();
        $restrictions->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
739
740
    /**
     * Returns a query builder for the given table, obtained from the connection pool.
     *
     * @param string $table table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
746 9
    private function getQueryBuilder(string $table)
747
    {
748 9
        return GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($table);
749
    }
750
751
    /**
     * Checks whether a user may access an item, based on group lists
     * (e.g. the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (!empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        // an item without access restriction is open to everybody
        return true;
    }
772
773
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further sub-parts evaluated by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE, _ADDTABLE
     *         - Default: Literal value
     * - After every part the hook "expandParameters" (SC_OPTIONS, crawler/class.tx_crawler_lib.php) is called, if registered
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array, with each value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Values that are not encapsulated in square brackets are literal: keep them as the only value
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30)

                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) === '_TABLE:') {
                    // Parse the ";"-separated "key:value" sub-parts
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Split at the first ":" only, so values (e.g. of _WHERE) may contain ":" themselves.
                        // Padding with null keeps a key without value "not set" for the isset() checks below.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV, false, 2), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        // Only "uid" or a column configured in TCA may be selected
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // The expression builder quotes field names itself, so the plain
                            // identifier is passed (quoting it beforehand would quote it twice).
                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            if (!empty($addTable)) {
                                // Only touch the FROM part when an additional table was configured.
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by the looked-up value: duplicates collapse and array_keys()
                            // returns the values themselves (not the name of the selected field).
                            $values = [];
                            while ($row = $statement->fetch()) {
                                $values[$row[$fieldName]] = true;
                            }
                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($values));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => $this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
911
912
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }
        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; a plain "break" would only end
                // the inner loop and every further URL would still contribute an entry.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }
                $val = (string)$val;
                // Empty values do not add a GET parameter, the URL is taken over unchanged
                $newUrls[] = $url . ($val !== '' ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode($val) : '');
            }
        }
        return $this->compileUrls($paramArray, $newUrls);
    }
943
944
    /************************************
945
     *
946
     * Crawler log
947
     *
948
     ************************************/
949
950
    /**
     * Return array of records from crawler queue for input page ID, or flush (delete) them
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries that are not yet run (exec_time = 0), "finished" => only entries with exec_time > 0, anything else (e.g. "all") => no filtering
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the flush is not restricted to page and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Constraints of the flush (DELETE) statement are collected separately from the SELECT above;
        // the composite expression joins everything added to it with AND.
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', intval($id)));
            // flushQueue() expects the condition as SQL string
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1004
1005
    /**
     * Return array of records from crawler queue for input set ID, or flush (delete) them
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries that are not yet run (exec_time = 0), "finished" => only entries with exec_time > 0, anything else (e.g. "all") => no filtering
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only evaluated together with $doFlush: if TRUE, the flush is not restricted to set and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Constraints of the flush (DELETE) statement are collected separately from the SELECT above;
        // the composite expression joins everything added to it with AND.
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', intval($set_id)));
            // flushQueue() expects the condition as SQL string; an empty string means "no restriction"
            $this->flushQueue($doFullFlush ? '' : (string)$flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1058
1059
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected entries are
     * posted to the event dispatcher (one event per set_id) before they are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; empty removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Normalise first: callers may hand over an expression object instead of a plain string
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder so that no query parts or
            // parameters leak from one statement into the next.
            $groupQueryBuilder = $this->getQueryBuilder($this->tableName);
            $groups = $groupQueryBuilder
                // select() would quote "DISTINCT set_id" as ONE identifier, hence selectLiteral()
                ->selectLiteral('DISTINCT ' . $groupQueryBuilder->quoteIdentifier('set_id'))
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    $subSet = $subSetQueryBuilder
                        // NOTE(review): other queries on the queue table use "qid" as key column;
                        // confirm that a "uid" column exists here.
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1099
1100
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 schedules the entry for the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1130
1131
    /************************************
1132
     *
1133
     * URL setting
1134
     *
1135
     ************************************/
1136
1137
    /**
     * Registers a URL for crawling.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or written to the table tx_crawler_queue (unless a duplicate is found).
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the key order matters: the serialized string is hashed below)
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Check if there is already an equal entry (unless the check is skipped)
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicates)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1219
1220
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries with the same page_id and parameters_hash are considered.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Field values of that entry; 'page_id' and 'parameters_hash' are read
     *
     * @return array List of qid values of the duplicates, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now" (or in the past)
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', (int)$tstamp)
                );
        }

        // Only UNPROCESSED entries count as duplicates: exec_time is still 0 for them.
        // NOTE(review): "process_id = 0" is the plain negation of the former "process_id != 0";
        // confirm it matches how unassigned entries are stored in that column.
        $statement = $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1281
1282
    /**
     * Provides the current system time as reported by time()
     *
     * @return int Current Unix timestamp in seconds
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1291
1292
    /************************************
1293
     *
1294
     * URL reading
1295
     *
1296
     ************************************/
1297
1298
    /**
     * Processes a single queue entry: locks it via exec_time, executes it and stores the result
     *
     * @param integer $queueId qid of the queue entry
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask (0, or including self::CLI_STATUS_POLLABLE_PROCESSED); nothing (null) is returned if no matching queue entry is found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without force only entries are taken that are not executed yet but scheduled for a process
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRecord = $queryBuilder->execute()->fetch();

        if (!is_array($queueRecord)) {
            return;
        }

        $parameters = unserialize($queueRecord['parameters']);
        if (!$parameters['rootTemplatePid']) {
            $this->logger->warning(
                'Page with (' . $queueRecord['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        } else {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        }

        // Slots receive the record by reference
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRecord]
        );

        // Set exec_time to lock record:
        $lockFields = ['exec_time' => $this->getCurrentTime()];
        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $lockFields['process_id_completed'] = $this->processID;
        }

        $queueIdentifier = ['qid' => (int)$queueId];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update('tx_crawler_queue', $lockFields, $queueIdentifier);

        $result = $this->readUrl_exec($queueRecord);
        $resultData = unserialize($result['content']);

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'];
        if (is_array($pollables)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                $procInstructions = $resultData['parameters']['procInstructions'];
                if (!is_array($procInstructions) || !in_array($pollable, $procInstructions)) {
                    continue;
                }
                if (!empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        // Slots receive the fields by reference before they are written
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update('tx_crawler_queue', $resultFields, $queueIdentifier);

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1397
1398
    /**
     * Inserts a new queue record, crawls it right away and stores the outcome.
     *
     * The record is inserted with exec_time already set (which locks it), is
     * handed to readUrl_exec(), and is then updated with the serialized result.
     * The post-process signal is emitted before that update and receives the
     * update fields by reference.
     *
     * @param array $field_array Field values of the queue record to insert
     *
     * @return array|string|bool Whatever readUrl_exec() returned for the new record
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock the record from the moment it is inserted.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $field_array);

        // readUrl_exec() needs the qid of the freshly inserted record.
        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $crawlResult = $this->readUrl_exec($field_array);

        // Storing result_data also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($crawlResult)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $crawlResult;
    }
1436
1437
    /**
     * Executes a single queue record.
     *
     * The record's serialized "parameters" decide what happens:
     * - with a "_CALLBACKOBJ" entry, that class is instantiated and its
     *   crawler_execute() is called with the remaining parameters;
     * - otherwise "url" is fetched through requestUrl() and an "urlCrawled"
     *   event is posted.
     *
     * @param array $queueRec Queue record; reads "parameters", "qid" and "set_id"
     * @return array|string|bool Result array with at least "content", the string
     *                           "ERROR" when the parameters do not decode to an
     *                           array, or false when requestUrl() failed
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters.
        // NOTE(review): unserialize() runs on a value stored in the queue record;
        // confirm that nothing untrusted can write to that column.
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object. !empty() keeps the original truthiness check but no
        // longer raises an undefined-index notice for plain URL entries.
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);
            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request. The crawler id is "<qid>:<hash>"; fe_init()
        // recomputes the same hash to verify the request.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1472
1473
    /**
     * Gets the content of a URL.
     *
     * Depending on the "makeDirectRequests" extension setting the request is
     * either delegated to sendDirectRequest() or sent as a hand-built HTTP/1.0
     * request over a socket (optionally through the configured proxy). With the
     * "follow30x" setting enabled, redirects are followed recursively and each
     * earlier response is kept under the "parentRequest" key.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param int $timeout Connection timeout in seconds, passed to fsockopen()
     * @param int $recursion Recursion limiter for redirects
     * @return array|bool Array with "request", "headers" and "content" (plus
     *                    "parentRequest" after a redirect), or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // Log the string that failed to parse; $url itself is just false here.
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        $scheme = $url['scheme'] ?? '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and put the absolute target URL into the request line.
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . (($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $explicitPort = $rurl['port'] ?? 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = ($explicitPort > 0) ? $explicitPort : 443;
        } else {
            $port = ($explicitPort > 0) ? $explicitPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->logger->info($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Follow the redirect exactly once per hop, keeping the timeout and
            // decrementing the recursion budget.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (!is_array($newRequestUrl)) {
                $this->logger->debug(
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1579
1580
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension
     * setting, the absRefPrefix of $GLOBALS['TSFE'], and - outside of CLI
     * mode - TYPO3_SITE_PATH. Without any of them "/" is returned.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Path from the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // Not in CLI mode: the base path can be determined from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/': force one leading slash first,
        // then one trailing slash.
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1610
1611
    /**
     * Runs a shell command through shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1621
1622
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are trimmed and collected up to the first blank line;
     * everything after it is collected untouched as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, one item per fgets() read
     *                                                  (a line, or a chunk of a long line).
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. Compare against false explicitly: fgets() can return a
        // falsy string such as "0", which must not end the loop.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line separates headers from content.
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1653
1654
    /**
     * Builds the lines of an HTTP/1.0 GET request for the given URL.
     *
     * @param array $url URL parts as returned by parse_url(); "host" is used,
     *                   "path", "query", "user" and "pass" are optional
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array Request line followed by the header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits absent components, so every optional part gets a default.
        $path = $url['path'] ?? '';
        if ($path === '') {
            // A request line needs a target; a URL without a path means the root.
            $path = '/';
        }
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            // Requests carrying the ADMCMD_previewWS parameter get a be_typo_user cookie.
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . ($url['pass'] ?? ''));
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1678
1679
    /**
     * Checks whether the given response headers describe a redirect and, if so,
     * builds the URL the crawler should request next.
     *
     * Only status lines containing "301 Moved", "302 Found" or "302 Moved" are
     * treated as redirects. When a user name is given, the credentials are
     * injected into the redirect target.
     *
     * @param array $headers HTTP response header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string New URL, or false if there is nothing to follow
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Splitting on the first colon only
        // keeps values that contain ": " intact and tolerates a missing space;
        // header names are compared case-insensitively.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Rebuild the target with the credentials in front of the host.
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '');
        if (!empty($target['port'])) {
            // Keep an explicit port of the redirect target.
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= $target['path'] ?? '';
        if (($target['query'] ?? '') != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1721
1722
    /**************************
1723
     *
1724
     * tslib_fe hooks:
1725
     *
1726
     **************************/
1727
1728
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header ("<qid>:<hash>"), looks up the queue
     * record and verifies that the request comes from the system by comparing
     * the hash. On success the crawler state is stored in the applicationData
     * of $params['pObj']; otherwise the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (not used here)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only crawler requests carry the header; leave everything else alone.
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad the parts so that a header without ":" yields an empty hash
        // instead of an undefined variable.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // Same hash as readUrl_exec() sends; hash_equals() compares it in
        // constant time.
        $isAuthentic = is_array($queueRec) && hash_equals(
            md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']),
            (string)$hash
        );

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matched: set it up.
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1765
1766
    /*****************************
1767
     *
1768
     * Compiling URLs to crawl - tools
1769
     *
1770
     *****************************/
1771
1772
    /**
     * Walks the page tree below a root page and compiles the URL rows for every
     * page found, including the pages mounted by mount point pages.
     *
     * The arguments are stored on the controller, where
     * drawURLs_addRowsForPage() picks them up.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // Ask for the builder of the table that is actually queried, so
                // the connection configured for "pages" is used.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a matching row the page is handled like a regular page.
                if (is_array($mountpage)) {
                    // fetch mounted pages
                    // NOTE: static analysis reports $MP as declared boolean, while
                    // the "<mount_pid>-<uid>" value assigned here is a string.
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1874
1875
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A bare "<pid>" covers only that page, "<pid>+" (no depth) covers the page
     * and 99 levels below it, "<pid>+<depth>" the page and <depth> levels below.
     * Results and page trees are cached in static variables for the rest of
     * the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry too.
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad to two items so a part without "+" does not hit an undefined offset.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (!isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1926
1927
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * The configurations come from getUrlsForPageRow() and are narrowed down to
     * $this->incomingConfigurationSelection when that is not empty. For every
     * remaining configuration urlListFromUrlArray() is called with the values
     * stored on the controller, unless the page is excluded by that
     * configuration.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (HTML, inserted as is)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        $content = '';

        if (empty($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';

            return $content;
        }

        // Labels of the sub-configuration options listed in the "options" column.
        $optionLabels = [
            'userGroups' => 'User Groups: ',
            'baseUrl' => 'Base Url: ',
            'procInstrFilter' => 'ProcInstr: ',
        ];

        // Traverse parameter combinations:
        $c = 0;
        foreach ($configurations as $confKey => $confArray) {
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: only the first row carries it, spanning all rows of the page.
            if (!$c) {
                $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            } else {
                $titleClm = '';
            }

            if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach (($confArray['paramExpanded'] ?? []) as $gVar => $gVal) {
                    $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= count($gVal);
                    $calcAccu[] = count($gVal);
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options: plain configuration values, so escape them for HTML output.
                $optionValues = '';
                foreach ($optionLabels as $optionKey => $optionLabel) {
                    if (!empty($subCfg[$optionKey])) {
                        $optionValues .= $optionLabel . htmlspecialchars((string)$subCfg[$optionKey]) . '<br/>';
                    }
                }

                // Compile row:
                $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string)$confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
            } else {
                $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string)$confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
2044
2045
    /*****************************
2046
     *
2047
     * CLI functions
2048
     *
2049
     *****************************/
2050
2051
    /**
2052
     * Running the functionality of the CLI (crawling URLs from queue)
2053
     *
2054
     * @param int $countInARun
2055
     * @param int $sleepTime
2056
     * @param int $sleepAfterFinish
2057
     * @return string
2058
     */
2059
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        // Overview: runs the registered CLI hooks, optionally purges old executed queue
        // entries, reserves up to $countInARun due queue entries for the current process
        // and calls readUrl() for each of them.
        // $sleepTime is handed to usleep() after every entry, $sleepAfterFinish to sleep()
        // once the batch is done. The return value is a bit mask combining the readUrl()
        // results with the CLI_STATUS_* flags set below.
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue: remove executed entries older than the configured number of days
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries that are due, not yet executed and not yet reserved by a process
        // TODO: Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $quidList = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Reserve the selected queue entries for this process.
        // NOTE: the reservation is not wrapped in a transaction.
        // TODO: make sure we're not taking queue entries that are already assigned
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $queryBuilderUpdate
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $queryBuilderUpdate->createNamedParameter($processId, \PDO::PARAM_STR))
            ->execute();

        // Save the number of assigned queue entries to determine how many have been processed later.
        // The process id is a string hash, so it must not be cast to int when used as identifier.
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        if ((int)$numberOfAffectedRows !== count($quidList)) {
            // Fewer rows were reserved than selected
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // If the CLI has been disabled between the start and the current readUrl() call,
            // return immediately; the process is NOT marked as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                // TODO: might need an additional return code
                $result |= self::CLI_STATUS_ABORTED;
                break; // possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2177
2178
    /**
     * Runs the CLI hooks registered for the crawler.
     *
     * Every entry found under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] is instantiated
     * through GeneralUtility::makeInstance() and its crawler_init() method is called
     * with this controller as the only argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
2192
2193
    /**
     * Tries to register a new crawler process under the given id.
     *
     * Counts the active, non-deleted process records whose ttl has not expired yet and
     * inserts a new record when that count is below the configured "processLimit".
     * Independently of the outcome, process records marked as deleted or without
     * assigned items are removed and the expired ("orphan") processes are released.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the process limit
     *                 was reached or no system process id could be determined
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $processQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // NOTE: the following statements are not wrapped in a transaction
        $activeProcesses = $processQuery
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        // Split the active records into still running ones and expired orphans
        $expiredProcessIds = [];
        $runningProcesses = 0;
        while ($processRow = $activeProcesses->fetch()) {
            if ($processRow['ttl'] >= $now) {
                $runningProcesses++;
                continue;
            }

            $expiredProcessIds[] = $processRow['process_id'];
        }

        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            // There are fewer running processes than allowed, so add a new one
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $newProcess = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $newProcess);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->deleteProcessesWithoutItemsAssigned();
        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);

        return $acquired;
    }
2258
2259
    /**
     * Release a process and the required resources
     *
     * Runs four independent UPDATE statements, each on its own QueryBuilder so that
     * neither SET parts nor bound parameters leak from one statement into the next:
     *  1. detach queue entries that belong to processes which are not active
     *  2. reset system_process_id of inactive processes that still own unexecuted queue entries
     *  3. mark the requested processes as non-active (only if they own no unexecuted queue entries)
     *  4. detach the unexecuted queue entries of the requested processes
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   currently has no effect (no locking/transaction is performed here);
     *                                kept for backwards compatibility
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. detach all queue entries that are assigned to processes which are not active
        $detachInactive = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $detachInactive
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // 2. FIXME: Not entirely sure that this is equivalent to the pre-Doctrine version, which
        // marked inactive processes WITHOUT unexecuted queue entries as deleted and reset their
        // system_process_id.
        $resetSystemProcessId = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $resetSystemProcessId
            ->update('tx_crawler_process')
            ->where(
                $resetSystemProcessId->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. mark all requested processes as non-active
        $deactivate = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $deactivate
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivate->expr()->in('process_id', $deactivate->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivate->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // 4. detach the unexecuted queue entries of the requested processes
        $detachRequested = $connectionPool->getQueryBuilderForTable('tx_crawler_queue');
        $detachRequested
            ->update('tx_crawler_queue')
            ->where(
                $detachRequested->expr()->eq('exec_time', 0),
                $detachRequested->expr()->in('process_id', $detachRequested->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2353
2354
    /**
     * Check whether the process with the given id is still marked as active.
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The id is bound as a string parameter: process ids are string hashes (see
     * CLI_buildProcessId()), so converting them to an integer would compare against
     * the wrong value.
     *
     * @param  string  $pid identification string for the process
     * @return boolean true if a process record with this id exists and its "active" column is 1
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_process');

        // Only the first record (ordered by ttl) is relevant
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq(
                    'process_id',
                    $queryBuilder->createNamedParameter((string)$pid, \PDO::PARAM_STR)
                )
            )
            ->orderBy('ttl')
            ->setMaxResults(1)
            ->execute()
            ->fetch();

        return is_array($row) && (int)$row['active'] === 1;
    }
2383
2384
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use as GeneralUtility::shortMD5() of the current
     * microtime and stored in $this->processID; later calls return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2396
2397
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float forwarded unchanged to microtime()
     *
     * @return mixed the value returned by microtime() for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2406
2407
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only when the extension setting "processDebug" is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2419
2420
    /**
     * Fetches the content of a URL by running the crawler's cli/bootstrap.php
     * through the configured PHP binary instead of sending an HTTP request.
     *
     * The frontend base path, the URL and the base64-encoded, serialized request
     * headers are passed to the script as shell-escaped arguments. The URL and the
     * measured duration are written to the logger.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty array if the URL cannot be parsed, otherwise an array with the
     *                keys "request" (header lines), "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Binary first, then the script and its three arguments, separated by single spaces
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers))),
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $response = $this->executeShellCommand($command);
        $elapsed = microtime(true) - $startedAt;
        $this->logger->info($url . ' ' . $elapsed);

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $response
        ];
    }
2458
2459
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the extension setting
     *   "cleanUpProcessedAge" (in days)
     * - entries whose scheduled time is older than the extension setting
     *   "cleanUpScheduledAge" (in days)
     *
     * Both settings are normalised to whole seconds before they are placed into the
     * condition, so string or fractional values cannot end up unfiltered in the SQL.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $processedAgeInSeconds = (int)round((float)($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int)round((float)($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        // Use the same time source as the rest of the queue handling
        $now = (int)$this->getCurrentTime();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2475
2476
    /**
     * Creates a TypoScriptFrontendController for the given page, stores it in
     * $GLOBALS['TSFE'] and runs its initialisation steps, as needed for using
     * TypoScript and TypoLink functions.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            null,
            $pageId,
            0
        );
        // Register the instance globally before any initialisation method is called
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->getConfigArray();
        $frontendController->settingLanguage();
        $frontendController->settingLocale();
        $frontendController->newCObj();
    }
2499
2500
    /**
     * Builds an md5 hash of the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing and therefore
     * do not influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);
        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2513
}
2514