Completed
Push — typo3v9 ( ea38b1...5ec12f )
by Tomas Norre
06:11
created

CrawlerController::getUrlsForPageId()   D

Complexity

Conditions 16
Paths 96

Size

Total Lines 93

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 47
CRAP Score 16.0552

Importance

Changes 0
Metric Value
cc 16
nc 96
nop 1
dl 0
loc 93
ccs 47
cts 50
cp 0.94
crap 16.0552
rs 4.606
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
29
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
30
use AOE\Crawler\Domain\Repository\ProcessRepository;
31
use AOE\Crawler\Domain\Repository\QueueRepository;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use Psr\Http\Message\UriInterface;
36
use Psr\Log\LoggerAwareInterface;
37
use Psr\Log\LoggerAwareTrait;
38
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
39
use TYPO3\CMS\Backend\Utility\BackendUtility;
40
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
41
use TYPO3\CMS\Core\Core\Environment;
42
use TYPO3\CMS\Core\Database\Connection;
43
use TYPO3\CMS\Core\Database\ConnectionPool;
44
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
45
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
46
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
47
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
48
use TYPO3\CMS\Core\Http\Uri;
49
use TYPO3\CMS\Core\Routing\SiteMatcher;
50
use TYPO3\CMS\Core\Site\Entity\Site;
51
use TYPO3\CMS\Core\Site\SiteFinder;
52
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
53
use TYPO3\CMS\Core\Utility\DebugUtility;
54
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
55
use TYPO3\CMS\Core\Utility\GeneralUtility;
56
use TYPO3\CMS\Core\Utility\MathUtility;
57
use TYPO3\CMS\Extbase\Object\ObjectManager;
58
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
59
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
60
use TYPO3\CMS\Frontend\Page\PageRepository;
61
62
/**
63
 * Class CrawlerController
64
 *
65
 * @package AOE\Crawler\Controller
66
 */
67
class CrawlerController implements LoggerAwareInterface
68
{
69
    use LoggerAwareTrait;
70
71
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
72
    const CLI_STATUS_REMAIN = 1; //queue not empty
73
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
74
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
75
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
76
77
    /**
78
     * @var integer
79
     */
80
    public $setID = 0;
81
82
    /**
83
     * @var string
84
     */
85
    public $processID = '';
86
87
    /**
88
     * @var array
89
     */
90
    public $duplicateTrack = [];
91
92
    /**
93
     * @var array
94
     */
95
    public $downloadUrls = [];
96
97
    /**
98
     * @var array
99
     */
100
    public $incomingProcInstructions = [];
101
102
    /**
103
     * @var array
104
     */
105
    public $incomingConfigurationSelection = [];
106
107
    /**
108
     * @var bool
109
     */
110
    public $registerQueueEntriesInternallyOnly = false;
111
112
    /**
113
     * @var array
114
     */
115
    public $queueEntries = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $urlList = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $extensionSettings = [];
126
127
    /**
128
     * Mount Point
129
     *
130
     * @var boolean
131
     */
132
    public $MP = false;
133
134
    /**
135
     * @var string
136
     */
137
    protected $processFilename;
138
139
    /**
140
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
141
     *
142
     * @var string
143
     */
144
    protected $accessMode;
145
146
    /**
147
     * @var BackendUserAuthentication
148
     */
149
    private $backendUser;
150
151
    /**
152
     * @var integer
153
     */
154
    private $scheduledTime = 0;
155
156
    /**
157
     * @var integer
158
     */
159
    private $reqMinute = 0;
160
161
    /**
162
     * @var bool
163
     */
164
    private $submitCrawlUrls = false;
165
166
    /**
167
     * @var bool
168
     */
169
    private $downloadCrawlUrls = false;
170
171
    /**
172
     * @var QueueRepository
173
     */
174
    protected $queueRepository;
175
176
    /**
177
     * @var ProcessRepository
178
     */
179
    protected $processRepository;
180
181
    /**
182
     * @var ConfigurationRepository
183
     */
184
    protected $configurationRepository;
185
186
    /**
187
     * @var string
188
     */
189
    protected $tableName = 'tx_crawler_queue';
190
191
192
    /**
193
     * @var int
194
     */
195
    protected $maximumUrlsToCompile = 10000;
196
197
    /**
     * Returns the access mode currently stored on this controller.
     *
     * The property is documented as holding 'gui', 'cli' or 'cli_im'; no
     * default is assigned in the class body, so the value is null until
     * something sets it.
     *
     * @return string
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
206
207
    /**
     * Stores the access mode of this controller instance.
     *
     * The value is stored as given; no validation is performed here.
     *
     * @param string $accessMode Expected to be one of 'gui', 'cli' or 'cli_im'
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
214
215
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file to $this->processFilename; enabling
     * removes that file again if it is present.
     *
     * @param bool $disabled Pass false to remove the marker file (defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Enabling: only remove the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
231
232
    /**
     * Tells whether the "disabled" marker file exists.
     *
     * @return bool true if $this->processFilename is an existing file
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
241
242
    /**
     * Overrides the path of the marker file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath File name including its path
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
251
252
    /**
     * Returns the path of the marker file used by setDisabled()/getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
259
260
    /************************************
261
     *
262
     * Getting URLs based on Page TSconfig
263
     *
264
     ************************************/
265
266 28
    /**
     * Wires up the repositories, remembers the backend user, sets the default
     * marker file path and loads the extension settings, falling back to
     * defaults for missing or out-of-range values.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);

        // Avoid an undefined-index notice when no backend user global has been set.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Defaults. The settings array may be empty, so every key is read with
        // a fallback instead of assuming it exists.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is clamped to 1..99 (default 1).
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        // maxCompileUrls is clamped to 1..1000000000 (default 10000).
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
289
290
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is stored exactly as passed in; the defaults applied in the
     * constructor (countInARun, processLimit) are not re-applied here.
     *
     * @param array $extensionSettings Complete settings array
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
300
301
    /**
     * Decides whether a page must be left out of crawling.
     *
     * The checks run in this order and the first one that matches wins:
     *  1. the page is hidden and the 'crawlHiddenPages' setting is off
     *  2. the doktype is 3 or 4, or is 199 or higher
     *  3. the doktype is listed in one of the EXTCONF 'excludeDoktype' lists
     *  4. one of the EXTCONF 'pageVeto' hooks returns anything but false
     *
     * @param array $pageRow Page record; 'hidden' and 'doktype' are read here
     * @return false|string false if the page may be crawled, otherwise the reason for skipping it
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // 1. hidden pages
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // 2. doktypes that are never crawled
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // 3. doktypes excluded through configuration
        $excludeLists = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludeLists as $listName => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $listName . '"';
            }
        }

        // 4. veto hooks: a hook returns false to accept the page, or true / a
        //    message to reject it. The first veto ends the evaluation.
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookName => $hookFunction) {
            $hookParameters = [
                'pageRow' => $pageRow
            ];
            $verdict = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            if ($verdict === false) {
                continue;
            }

            return is_string($verdict)
                ? $verdict
                : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
        }

        return false;
    }
360
361
    /**
     * Returns the crawler configurations for a page record, unless the page
     * has to be skipped.
     *
     * Despite its name the result is the configuration array produced by
     * getUrlsForPageId(), not a plain list of URLs.
     *
     * @param array $pageRow Page record with at least the doktype and uid columns
     * @param string $skipMessage Receives the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page; empty when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
384
385
    /**
     * Reports whether the queue table holds no pending entry (exec_time = 0)
     * for the given page and configuration hash.
     *
     * Callers use a true result to skip a more detailed per-URL check.
     *
     * @param int $uid Page id
     * @param string $configurationHash Hash identifying the configuration
     * @return bool true when no matching pending entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', (int)$uid),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one pending entry exists.
        return !$pendingEntries;
    }
416
417
    /**
     * Creates a list of URLs from one configuration entry and, if asked for,
     * submits them to the queue.
     *
     * Every entry of $vv['URLs'] is resolved to a final URL. A URL that was not
     * seen before (tracked through $duplicateTrack) is appended to
     * $this->urlList and then either queued via addUrl() or collected in
     * $downloadUrls. A URL that was already seen is only listed, muted, in the
     * returned HTML.
     *
     * @param array $vv Configuration entry; 'URLs' and 'subCfg' are read here
     * @param array $pageRow Page record; only 'uid' is read here
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Requests per minute, used to spread the displayed schedule times; zero or less disables the spreading
     * @param bool $submitCrawlUrls If set, submits the URLs to the queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is not), fills $downloadUrls
     * @param array $duplicateTrack Passed by reference; uniqueness keys of the URLs handled so far
     * @param array $downloadUrls Passed by reference; receives the URLs for download
     * @param array $incomingProcInstructions Processing instructions to filter by
     * @return string HTML list of URLs joined by "<br>" (for display in the backend module), or an error text when no URLs were generated
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // These sub configuration keys are optional, so read them with a fallback.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter does not depend on the individual URL, so evaluate it once.
        // If it rejects this configuration, nothing is listed.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. A rate of zero (or less) would divide by
        // zero, so no spreading is applied in that case.
        $requestsPerMinute = (float)$reqMinute;
        $secondsPerRequest = $requestsPerMinute > 0 ? 60 / $requestsPerMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Displayed schedule time, rounded down to the full minute.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = floor($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
501
502
    /**
     * Checks a processing instruction list against the incoming instructions.
     *
     * With no incoming instructions everything passes. Otherwise the result is
     * true as soon as one incoming instruction is an item of $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions to look for
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
522
523 2
    /**
     * Returns the page TSconfig to use for a page.
     *
     * Without a mount point ($this->MP) the TSconfig of $id is loaded. With a
     * mount point, the TSconfig of the page id found after the '-' in
     * $this->MP is loaded instead. The result is then handed by reference to
     * every registered 'getPageTSconfigForId' hook, which may alter it.
     *
     * @param int $id Page ID
     * @return array Page TSconfig, possibly altered by hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // The destructuring skips the first part of $this->MP and assigns the
            // second part to $mountPointId (static analysers may miss this assignment).
            [, $mountPointId] = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration. The hook list is optional, so
        // look it up with a fallback instead of reading a possibly missing key.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
544
545
    /**
     * Returns the crawler configurations that apply to a page.
     *
     * Despite its name this returns an array of configurations keyed by
     * configuration name, not a plain list of URLs. Each entry holds the keys
     * 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'.
     *
     * Configurations come from page TSconfig first and from
     * tx_crawler_configuration records second; a record never overwrites a
     * TSconfig entry of the same name. The result is finally passed by
     * reference to the EXTCONF 'processUrls' hooks.
     *
     * @param int $pageId Page ID
     * @return array
     */
    public function getUrlsForPageId($pageId)
    {
        $res = $this->collectConfigurationsFromPageTsConfig($pageId);
        $res = $this->collectConfigurationsFromRecords($pageId, $res);

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }

    /**
     * Builds the configurations defined in page TSconfig
     * (tx_crawler.crawlerCfg.paramSets) for the given page.
     *
     * @param int $pageId Page ID
     * @return array Configurations keyed by paramSet name
     */
    private function collectConfigurationsFromPageTsConfig($pageId)
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        // Base query of every compiled URL; carries the mount point when one is set.
        $baseQuery = '?id=' . $pageId;
        if ($this->MP) {
            $baseQuery .= '&MP=' . $this->MP;
        }

        $res = [];
        foreach ($crawlerCfg as $key => $values) {
            // Only the "name." sub arrays describe a configuration.
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list (trim its items) when one is given.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // Process the configuration only if it is not page-specific or if
            // the current page is among the listed pages.
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // The parameter string is optional; a missing one yields no parameters.
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        return $res;
    }

    /**
     * Adds the configurations stored in tx_crawler_configuration records found
     * up the rootline of the given page to an existing result.
     *
     * @param int $pageId Page ID
     * @param array $res Configurations collected so far; existing names are kept
     * @return array $res extended by the accessible, applicable records
     */
    private function collectConfigurationsFromRecords($pageId, array $res)
    {
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // Check access to the configuration record: unrestricted records,
            // admins and members of one of the record's backend groups pass.
            $accessible = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$accessible) {
                continue;
            }

            // Process the configuration only if it is not page-specific or if
            // the current page is among the listed pages.
            $pidsOnly = (string)$configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Don't overwrite previously defined paramSets.
            $key = $configurationRecord['name'];
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'rootTemplatePid' => (int)$configurationRecord['root_template_pid'],
                'key' => $key
            ];

            // NOTE(review): the comparison is deliberately left loose because the
            // value types returned by expandExcludeString() are not visible here.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        return $res;
    }
645
646
    /**
     * Collects the names of all crawler configurations available for a page
     * branch.
     *
     * The names come from two places: the paramSets in the page TSconfig of
     * $rootid, and the non-deleted tx_crawler_configuration records stored on
     * the pages of the rootline of $rootid or of the page tree below it. The
     * list is not de-duplicated.
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Number of tree levels handed to the page tree lookup
     * @return array List of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        // Names defined through page TSconfig; only "name." sub arrays count.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        $configurationNames = [];
        foreach ($paramSets as $setName => $setup) {
            if (is_array($setup)) {
                $configurationNames[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the tree below the root page, limited by the backend
        // user's page permission clause.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permissionClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
699
700
    /**
     * Creates a query builder for the given table through the connection pool.
     *
     * @param string $table Table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
710
711
    /**
     * Checks whether a user's groups grant access to an item.
     *
     * An empty access list grants access to everybody. Otherwise the user
     * needs at least one group uid that is an item of the access list.
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs allowed to access the item
     * @return bool                 true if the access list is empty or contains one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
732
733
    /**
     * Expands the parameter configuration into lists of individual values.
     *
     * Syntax of each value:
     * - If the value is wrapped in [...] it is expanded according to the syntax below, otherwise it is taken literally
     * - The bracket content is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, both ends included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           The optional sub-parameters _PIDFIELD, _FIELD, _WHERE and _ADDTABLE are read as well.
     *         - Default: Literal value
     *
     * After every part the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called; they receive the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array with every value replaced by an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        $hookName = 'crawler/class.tx_crawler_lib.php';

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Values that are not encapsulated in square brackets are literal:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) === '_TABLE:') {
                    // Parse the ";"-separated "key:value" sub-parameters:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyValue = GeneralUtility::trimExplode(':', $spV);
                        $subpartParams[$keyValue[0]] = isset($keyValue[1]) ? $keyValue[1] : null;
                    }
                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table);

                            // Only touch the FROM part when an additional table was configured;
                            // add() without the append flag replaces the part, so passing an
                            // empty string here would drop the table set by from().
                            if ($addTable !== '') {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $queryBuilder->where(
                                $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                $where
                            );

                            $transOrigPointerField = isset($GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the value of the selected field, so that the
                            // keys collected below are the record values (not the column name).
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (!empty($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookName]['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookName]['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookName]['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
871
872
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless compilation is cut short: as soon as the list built for one parameter holds more
     * entries than $this->maximumUrlsToCompile, no further URLs are added for that parameter.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every accumulated URL with every value of this parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Empty values leave the URL untouched instead of adding "&name=".
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops; breaking only the inner one would keep adding
                    // one more URL for each remaining base URL.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
903
904
    /************************************
905
     *
906
     * Crawler log
907
     *
908
     ************************************/
909
910
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the delete is not restricted by page ID or filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();

            // The composite expression joins all added constraints with AND.
            $constraints = $expressionBuilder->andX();
            switch ($filter) {
                case 'pending':
                    $constraints->add($expressionBuilder->eq('exec_time', 0));
                    break;
                case 'finished':
                    $constraints->add($expressionBuilder->gt('exec_time', 0));
                    break;
            }
            $constraints->add($expressionBuilder->eq('page_id', intval($id)));

            $this->flushQueue($doFullFlush ? '1=1' : (string)$constraints);
            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
964
965
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the delete is not restricted by set ID or filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();

            // The composite expression joins all added constraints with AND.
            $constraints = $expressionBuilder->andX();
            // FIXME: Write Unit tests for Filters
            switch ($filter) {
                case 'pending':
                    $constraints->add($expressionBuilder->eq('exec_time', 0));
                    break;
                case 'finished':
                    $constraints->add($expressionBuilder->gt('exec_time', 0));
                    break;
            }
            $constraints->add($expressionBuilder->eq('set_id', intval($set_id)));

            // An empty condition makes flushQueue() remove every entry.
            $this->flushQueue($doFullFlush ? '' : (string)$constraints);
            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1018
1019
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected entries are
     * read first (grouped by set_id) and posted to the event dispatcher before they are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may hand over an expression object; work on its string form.
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends to the FROM part,
            // so re-using one builder would list the table once more per statement.
            // NOTE(review): assumes getQueryBuilder() hands out a fresh builder per call - confirm.
            $groupQueryBuilder = $this->getQueryBuilder($this->tableName);
            $groups = $groupQueryBuilder
                // select() quotes its arguments as identifiers, so DISTINCT is expressed via GROUP BY.
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries on this table address the key column as "qid";
                    // verify that a "uid" column exists.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1059
1060
    /**
     * Stores a call back entry in the crawler queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is written into the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no usable schedule time was given.
        $scheduledAt = (int)$schedule;
        if ($scheduledAt === 0) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRow);
    }
1090
1091
    /************************************
1092
     *
1093
     * URL setting
1094
     *
1095
     ************************************/
1096
1097
    /**
     * Registers a URL for crawling.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or inserted into the queue table. Before inserting, existing duplicates
     * are looked up (unless skipped); if any are found, nothing is inserted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, otherwise FALSE
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the key order matters, the serialized form is hashed below):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feGroupList) {
            $parameters['feUserGroupList'] = $feGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered, not stored to the database.
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Check if there is already an equal entry, unless the caller opted out.
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicates)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1179
1180
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past (or now), it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries with the same page_id and parameters_hash as $fieldArray are considered.
     *
     * @param int $tstamp Scheduled time of the entry to be added
     * @param array $fieldArray Queue row to be added; the keys "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicates found
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = (int)$this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now" (or earlier)
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        $statement = $queryBuilder
            // Duplicates are entries that still wait for processing: exec_time is 0 until an
            // entry has been executed. The previous "!= 0" comparisons matched the opposite set.
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            // NOTE(review): assumes an entry not yet taken by a process compares equal to 0
            // in process_id - confirm against the table definition.
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1241
1242
    /**
     * Provides the current Unix timestamp as reported by PHP's time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1251
1252
    /************************************
1253
     *
1254
     * URL reading
1255
     *
1256
     ************************************/
1257
1258
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as executed (exec_time), runs it through readUrl_exec()
     * and stores the serialized result in the record. The PREPROCESS and POSTPROCESS signals are
     * emitted before execution and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success;
     *                 also 0 if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; keep the documented integer return type.
            return $ret;
        }

        // unserialize() yields false for broken data, so check the type before reading offsets.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->logger->warning(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back a plain string instead of an array with a "content" key.
        // NOTE(review): "content" can originate from a fetched response; unserialize() on
        // external data is risky - consider restricting allowed classes.
        $resultData = (is_array($result) && isset($result['content']) && is_string($result['content']))
            ? unserialize($result['content'])
            : false;

        // atm there's no need to point to specific pollable extensions
        $pollables = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        if (is_array($pollables)
            && is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
        ) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1357
1358
    /**
     * Reads the URL of a queue entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set to the current time, executed through
     * readUrl_exec(), and afterwards updated with the serialized result. The POSTPROCESS
     * signal is emitted before the result is written.
     *
     * @param array $field_array Queue field array,
     *
     * @return string Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert('tx_crawler_queue', $field_array);

        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        // Handed over by reference, so slots are able to alter what gets stored.
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1396
1397
    /**
     * Read URL for a queue record
     *
     * Decodes the serialized "parameters" of the queue record and then either
     * delegates to the callback object named in "_CALLBACKOBJ" or performs a
     * regular frontend request through requestUrl().
     *
     * @param array $queueRec Queue record (reads "parameters", "qid" and "set_id")
     * @return array|string|bool 'ERROR' if the parameters do not decode to an array,
     *                           ['content' => ...] for callback objects, otherwise
     *                           the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        // NOTE(review): unserialize() on stored data may instantiate arbitrary classes;
        // consider restricting "allowed_classes" if queue content is not fully trusted.
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object. The key is absent for regular requests, hence !empty()
        // instead of a plain array access that would raise an undefined-index notice.
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);
            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request.
        // The crawler id is "<qid>:<md5 of qid|set_id|encryptionKey>".
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1432
1433
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension setting "makeDirectRequests" the request is
     * either delegated to sendDirectRequest() or sent as a raw HTTP/1.0 request
     * over a socket (optionally through the proxy configured in
     * $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']). If "follow30x" is
     * enabled, redirects are followed recursively; the response of each
     * redirecting request is kept under the key "parentRequest".
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout handed to fsockopen()
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with "request", "headers" and "content" (plus
     *                       "parentRequest" after a redirect), or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits the "scheme" key entirely for scheme-less URLs.
        $scheme = $url['scheme'] ?? '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            // When talking to the proxy the request line carries the absolute URL.
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . (($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $isHttps = ($scheme === 'https');
        if ($isHttps) {
            $host = 'ssl://' . $host;
        }
        $port = (($rurl['port'] ?? 0) > 0) ? $rurl['port'] : ($isHttps ? 443 : 80);

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->logger->debug(
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId, 'errno' => $errno, 'errstr' => $errstr]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->logger->info($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Follow the redirect exactly once per hop, keeping the timeout and
            // decrementing the recursion limiter so redirect loops terminate.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (!is_array($newRequestUrl)) {
                $this->logger->debug(
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1539
1540
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the extension setting "frontendBasePath",
     * $GLOBALS['TSFE']->absRefPrefix, and - outside of CLI mode - the
     * TYPO3_SITE_PATH environment value. Falls back to "/".
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // Outside CLI mode the path can be taken from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegements>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1570
1571
    /**
     * Runs the given command line through shell_exec() and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string|false|null Output of the command as returned by shell_exec()
     */
    protected function executeShellCommand($command)
    {
        $commandOutput = shell_exec($command);

        return $commandOutput;
    }
1581
1582
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (trimmed) up to the first blank line; everything
     * after that is collected unmodified as content. Anything that is not a
     * resource yields an empty response.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, one item per fgets() read
     *                                                  (a line, or at most 2047 bytes of it).
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a chunk such as "0" is falsy but valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line separates headers from the body.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1613
1614
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line (HTTP/1.0 GET) followed by the Host, optional
     * preview cookie, Connection, optional Basic Authorization, X-T3crawler and
     * User-Agent headers.
     *
     * @param array $url URL parts as returned by parse_url(); "path", "query",
     *                   "user" and "pass" are optional
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     *
     * @return array List of header lines (without line endings)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() only returns the components that are present, so every
        // optional part needs a default to avoid undefined-index notices.
        $path = (string)($url['path'] ?? '');
        if ($path === '') {
            // An empty request target would produce a malformed request line.
            $path = '/';
        }
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');

        $reqHeaders = [];
        // Strict comparison keeps a query string of "0" instead of dropping it.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . ($url['pass'] ?? ''));
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1638
1639
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only status lines containing "301 Moved", "302 Found" or "302 Moved" are
     * treated as redirects. If HTTP auth credentials are given they are woven
     * into the redirect target so the follow-up request authenticates as well.
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if there is no (usable) redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        // An empty header list has no status line to inspect.
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Header names are case-insensitive and
        // lines without a colon (e.g. the status line) carry no name/value pair.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        $target = parse_url($location);
        if (empty($target)) {
            return false;
        }

        // Rebuild the URL with credentials; keep an explicit port of the target.
        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . ($target['path'] ?? '');
        if (($target['query'] ?? '') !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1681
1682
    /**************************
1683
     *
1684
     * tslib_fe hooks:
1685
     *
1686
     **************************/
1687
1688
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected as "<qid>:<hash>" where the hash is the md5 of
     * "qid|set_id|encryptionKey". On success the crawler state is stored in
     * $params['pObj']->applicationData['tx_crawler']; otherwise the request is
     * terminated with die().
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only crawler requests carry the header; everything else is left untouched.
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Authenticate crawler request:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        // array_pad() guards against a header without the ":" separator.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // hash_equals() compares in constant time so the expected hash cannot be probed via timing.
        $isAuthentic = is_array($queueRec)
            && hash_equals(
                md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']),
                (string)$hash
            );

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matched, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        // NOTE(review): unserialize() of stored data - consider restricting "allowed_classes".
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1725
1726
    /*****************************
1727
     *
1728
     * Compiling URLs to crawl - tools
1729
     *
1730
     *****************************/
1731
1732
    /**
     * Collects the page tree starting at $id and renders the URL rows of every page in it.
     *
     * The given run settings are stored on the instance first, because
     * drawURLs_addRowsForPage() reads them from there. Mount point pages
     * additionally get the rows of the mounted tree, rendered with $this->MP set.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Keep the run settings on the instance and reset the per-run collectors.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the tree, limited by the page permission clause of the backend user:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permissionClause);

        // The root page itself is added first, if it is accessible.
        $rootRow = BackendUtility::readPageAccess($id, $permissionClause);
        if (is_array($rootRow)) {
            $pageTree->tree[] = [
                'row' => $rootRow,
                'HTML' => IconUtility::getIconForRecord('pages', $rootRow)
            ];
        }

        // Then the branch beneath it:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Walk the tree and render the rows page by page:
        $rows = '';
        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            $isMountPoint = ($treeItem['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT);
            if ($isMountPoint) {
                // Load the full page record of the mount point (only deleted records are filtered)
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPointRecords = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($treeItem['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->getRestrictions()->reset();

                // MP value in the form "<mount_pid>-<uid of the mount point page>"
                $this->MP = $mountPointRecords[0]['mount_pid'] . '-' . $treeItem['row']['uid'];

                // Render the pages of the mounted tree
                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permissionClause);
                $mountedTree->getTree($mountPointRecords[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if ($mountPointRecords[0]['mount_pid_ol']) {
                    // mount_pid_ol enabled: the mounted page replaces the mount point page
                    $treeItem['row']['uid'] = $mountPointRecords[0]['mount_pid'];
                } else {
                    // without mount_pid_ol the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $rows;
    }
1834
1835
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of "<pid>" or "<pid>+<depth>"
     * entries. A bare "<pid>" excludes only that page; "<pid>+" (or an empty/zero
     * depth after the "+") is expanded with depth 99. Results, and the page
     * trees fetched for them, are memoized in static caches for the whole request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // A part without "+" has no depth segment; pad so the
                    // destructuring never reads an undefined offset.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1886
1887
    /**
     * Renders the table rows of one page for the page tree display.
     * Every crawler configuration that applies to the page yields one row showing
     * its GET variable configuration; a page without any configuration yields a
     * single "No entries" row.
     *
     * @param    array        Page row
     * @param    string        Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // All configurations that apply to this page
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Drop every configuration that is not part of the current selection
        $selection = $this->incomingConfigurationSelection;
        if (!empty($selection)) {
            foreach (array_keys($configurations) as $confKey) {
                if (!in_array($confKey, $selection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        $c = 0;
        $content = '';
        if (empty($configurations)) {
            $message = empty($skipMessage) ? '' : ' (' . $skipMessage . ')';

            // Single row telling that nothing is configured for this page:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // One row per configuration:
            foreach ($configurations as $confKey => $confArray) {
                // The title cell is only emitted on the first row and spans all rows of the page.
                $titleClm = !$c
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>'
                    : '';

                $excludedPids = $this->expandExcludeString($confArray['subCfg']['exclude']);
                if (in_array($pageRow['uid'], $excludedPids)) {
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters, plus the number of resulting combinations:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>'
                        . 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if ($confArray['subCfg']['userGroups']) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if ($confArray['subCfg']['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line:
                    $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                    $parsedParameters = trim(str_replace('&', chr(10) . '&', $parsedParameters));

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode($parsedParameters))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $c++;
            }
        }

        return $content;
    }
2001
2002
    /*****************************
2003
     *
2004
     * CLI functions
2005
     *
2006
     *****************************/
2007
2008
    /**
     * Runs one CLI pass: crawls due URLs from the queue.
     *
     * Steps: run the CLI hooks, purge old executed entries (if the extension
     * setting "purgeQueueDays" is positive), pick up to $countInARun due queue
     * entries, reserve them for the current process and read them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to pick for this run
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the last entry, passed to sleep()
     * @return int Bitmask built from the results of readUrl() and the CLI_STATUS_* constants
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // Hooks go first:
        $this->CLI_runHooks();

        // Purge entries that were executed more than "purgeQueueDays" days ago
        $purgeDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeDays;

            $purgeQueryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $purged = $purgeQueryBuilder
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if ($purged === false) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Pick the entries that are due and neither executed nor reserved yet:
        //TODO Shouldn't this reside within the transaction?
        $selectQueryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $selectQueryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQueryBuilder->expr()->eq('exec_time', 0),
                $selectQueryBuilder->expr()->eq('process_scheduled', 0),
                $selectQueryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        } else {
            $quidList = array_column($rows, 'qid');

            $processId = $this->CLI_buildProcessId();

            // Reserve the queue entries for this process
            //TODO make sure we're not taking assigned queue-entires
            $reserveQueryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $reserveQueryBuilder
                ->update('tx_crawler_queue')
                ->where(
                    $reserveQueryBuilder->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // Store the number of assigned entries to determine later how many were processed
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            // Fewer/more reserved rows than selected ones is treated as a collision with another process.
            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $queueRow) {
                $result |= $this->readUrl($queueRow['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // If the CLI got disabled in the meantime, leave right away
                // without marking the process as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2134
2135
    /**
     * Runs the registered CLI hooks.
     *
     * Every class reference found in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] is instantiated
     * and, if an object was created, its crawler_init() method is called with this
     * controller as the only argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                // nothing usable was created for this reference, skip it
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2149
2150
    /**
     * Tries to register a new crawler process with the given id.
     *
     * All processes flagged as active and not deleted are inspected: those whose ttl
     * lies in the past are collected as orphans, the others count against the
     * configured "processLimit". A new process row is inserted only while the number
     * of running processes is below that limit. Independent of the outcome, processes
     * marked as deleted are removed and the collected orphans are released.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if the process row was inserted, false if no system process id
     *                 is available or the process limit has been reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();

        // split the active processes into expired ones (orphans) and still running ones
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            // there are less than allowed active processes, so add a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);

        return $acquired;
    }
2214
2215
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements in this order:
     *  1. queue entries assigned to any inactive process are unassigned,
     *  2. inactive processes that still own queue entries with exec_time = 0 get
     *     their system_process_id reset,
     *  3. the requested, not yet deleted processes without queue entries having
     *     exec_time = 0 are marked as non-active,
     *  4. unprocessed queue entries (exec_time = 0) of the requested processes are
     *     unassigned.
     *
     * Each statement is built with its own QueryBuilder. The previous implementation
     * reused a single builder and reset its "set" part between statements; that reset
     * was missing before statement 3, so "system_process_id = 0" leaked into the
     * "mark as non-active" update.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the transaction statements are disabled
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // NOTE: BEGIN/COMMIT around these statements (skipped when $withinLock is set) is
        // currently disabled, so the statements below are not executed atomically.

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. unassign all queue entries which belong to processes that are not active
        $queueOfInactiveProcesses = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueOfInactiveProcesses
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // 2. reset the system process id of inactive processes
        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // The previous version matched
        //   active=0 AND deleted=0 AND NOT EXISTS (
        //       SELECT * FROM tx_crawler_queue
        //       WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
        //       AND tx_crawler_queue.exec_time = 0
        //   )
        // and set both deleted = 1 and system_process_id = 0.
        $inactiveProcesses = $connectionPool->getQueryBuilderForTable($this->tableName);
        $inactiveProcesses
            ->update('tx_crawler_process')
            ->where(
                $inactiveProcesses->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. mark all requested processes as non-active
        $requestedProcesses = $connectionPool->getQueryBuilderForTable($this->tableName);
        $requestedProcesses
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $requestedProcesses->expr()->in(
                    'process_id',
                    $requestedProcesses->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)
                ),
                $requestedProcesses->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // 4. unassign the not yet executed queue entries of the requested processes
        $queueOfRequestedProcesses = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueOfRequestedProcesses
            ->update('tx_crawler_queue')
            ->where(
                $queueOfRequestedProcesses->expr()->eq('exec_time', 0),
                $queueOfRequestedProcesses->expr()->in(
                    'process_id',
                    $queueOfRequestedProcesses->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)
                )
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2309
2310
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The process id is bound as a named query parameter. It used to be passed through
     * intval(), which turned the string id produced by CLI_buildProcessId() into an
     * integer and therefore compared against the wrong value.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false if no row exists for the given id
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $statement = $queryBuilder
            ->from('tx_crawler_process')
            ->select('active')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter($pid))
            )
            ->orderBy('ttl')
            ->execute();

        // only the first row (lowest ttl) decides
        $row = $statement->fetch(0);
        if (!$row) {
            return false;
        }

        return (int)$row['active'] === 1;
    }
2339
2340
    /**
     * Returns the id of the current process.
     *
     * The id is created on first use by passing the value of $this->microtime(true)
     * to GeneralUtility::shortMD5() and is then kept in $this->processID, so later
     * calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2352
2353
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float passed through unchanged to the native function
     *
     * @return mixed whatever the native microtime() returns for the given argument
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = \microtime($get_as_float);

        return $currentMicrotime;
    }
2362
2363
    /**
     * Writes a message followed by a newline to the output and flushes it.
     * Nothing is printed unless the extension setting "processDebug" evaluates
     * to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2375
2376
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs EXT:crawler's cli/bootstrap.php with the
     * configured PHP binary ("phpPath" setting). The frontend base path, the URL and
     * the base64-encoded, serialized request headers are passed as escaped arguments.
     * The URL and the elapsed time are written to the logger.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty if the URL cannot be parsed, otherwise an array with the keys
     *                "request", "headers" (always an empty string) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // every part is escaped on its own before the command line is assembled
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers))),
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $responseBody = $this->executeShellCommand($command);
        $this->logger->info($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $responseBody,
        ];
    }
2414
2415
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting
     *
     * Both settings are read as a number of days and converted to seconds.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2431
2432
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * A new TypoScriptFrontendController for the given page is stored in
     * $GLOBALS['TSFE'] before its initialization methods are called in a fixed order.
     *
     * @param int $pageId
     * @return void
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     * @throws \TYPO3\CMS\Core\Http\ImmediateResponseException
     */
    protected function initTSFE(int $pageId): void
    {
        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            null,
            $pageId,
            0
        );

        // register the instance globally first, the calls below are made on that same object
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->getConfigArray();
        $frontendController->settingLanguage();
        $frontendController->settingLocale();
        $frontendController->newCObj();
    }
2455
2456
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2469
2470
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is handed to the router as
     * "_language", and host, scheme and port of a non-empty alternative base URL replace
     * those of the generated URI. Without a Site, the URL is assembled as
     * "<base>/index.php<queryString>&cHash=<hash>".
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces "http", 1 forces "https",
     *                         any other value keeps the scheme
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // NOTE(review): the returned language was never used. The call is kept without
                // assignment in case callers rely on it rejecting unknown language ids - confirm
                // and remove if the router performs the same check.
                $site->getLanguageById((int)$queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeUri->getHost());
                $url = $url->withScheme($alternativeUri->getScheme());
                $url = $url->withPort($alternativeUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryString);
        }

        // enforce the scheme requested by the crawler configuration, if any
        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2523
}
2524