Completed
Push — remove/command-line-controller ( ce3e0b )
by Tomas Norre
14:48
created

CrawlerController::CLI_main_flush()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
nc 4
nop 0
dl 0
loc 27
ccs 0
cts 8
cp 0
crap 20
rs 9.488
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Domain\Model\Reason;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use AOE\Crawler\Utility\IconUtility;
33
use AOE\Crawler\Utility\SignalSlotUtility;
34
use TYPO3\CMS\Backend\Utility\BackendUtility;
35
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
36
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
37
use TYPO3\CMS\Core\Database\Connection;
38
use TYPO3\CMS\Core\Database\ConnectionPool;
39
use TYPO3\CMS\Core\Database\DatabaseConnection;
40
use TYPO3\CMS\Core\Log\Logger;
41
use TYPO3\CMS\Core\Log\LogLevel;
42
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
43
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
44
use TYPO3\CMS\Core\Utility\DebugUtility;
45
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
46
use TYPO3\CMS\Core\Utility\GeneralUtility;
47
use TYPO3\CMS\Core\Utility\MathUtility;
48
use TYPO3\CMS\Extbase\Object\ObjectManager;
49
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
50
use TYPO3\CMS\Frontend\Page\PageGenerator;
51
use TYPO3\CMS\Frontend\Page\PageRepository;
52
use TYPO3\CMS\Frontend\Utility\EidUtility;
53
use TYPO3\CMS\Lang\LanguageService;
54
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
55
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
56
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
59
/**
60
 * Class CrawlerController
61
 *
62
 * @package AOE\Crawler\Controller
63
 */
64
class CrawlerController
65
{
66
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
67
    const CLI_STATUS_REMAIN = 1; //queue not empty
68
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
69
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
70
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
71
72
    /**
73
     * @var integer
74
     */
75
    public $setID = 0;
76
77
    /**
78
     * @var string
79
     */
80
    public $processID = '';
81
82
    /**
83
     * One hour is max stalled time for the CLI
84
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
85
     *
86
     * @var integer
87
     */
88
    public $max_CLI_exec_time = 3600;
89
90
    /**
91
     * @var array
92
     */
93
    public $duplicateTrack = [];
94
95
    /**
96
     * @var array
97
     */
98
    public $downloadUrls = [];
99
100
    /**
101
     * @var array
102
     */
103
    public $incomingProcInstructions = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $incomingConfigurationSelection = [];
109
110
    /**
111
     * @var bool
112
     */
113
    public $registerQueueEntriesInternallyOnly = false;
114
115
    /**
116
     * @var array
117
     */
118
    public $queueEntries = [];
119
120
    /**
121
     * @var array
122
     */
123
    public $urlList = [];
124
125
    /**
126
     * @var boolean
127
     */
128
    public $debugMode = false;
129
130
    /**
131
     * @var array
132
     */
133
    public $extensionSettings = [];
134
135
    /**
136
     * Mount Point
137
     *
138
     * @var boolean
139
     */
140
    public $MP = false;
141
142
    /**
143
     * @var string
144
     */
145
    protected $processFilename;
146
147
    /**
148
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
149
     *
150
     * @var string
151
     */
152
    protected $accessMode;
153
154
    /**
155
     * @var BackendUserAuthentication
156
     */
157
    private $backendUser;
158
159
    /**
160
     * @var integer
161
     */
162
    private $scheduledTime = 0;
163
164
    /**
165
     * @var integer
166
     */
167
    private $reqMinute = 0;
168
169
    /**
170
     * @var bool
171
     */
172
    private $submitCrawlUrls = false;
173
174
    /**
175
     * @var bool
176
     */
177
    private $downloadCrawlUrls = false;
178
179
    /**
180
     * @var QueueRepository
181
     */
182
    protected  $queueRepository;
183
184
    /**
185
     * @var ProcessRepository
186
     */
187
    protected $processRepository;
188
189
    /**
190
     * @var string
191
     */
192
    protected $tableName = 'tx_crawler_queue';
193
194
    /**
195
     * @var array
196
     */
197
    private $cliArgs;
198
199
200
    /**
201
     * @var Logger
202
     */
203
    private $logger;
204
205
    /**
     * Returns the internal access mode currently stored on this controller.
     *
     * @return string the stored mode; the property documentation lists 'gui', 'cli' and 'cli_im'
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
214
215
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode mode to remember; the property documentation lists 'gui', 'cli' and 'cli_im'
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
222
223
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file to $this->processFilename; enabling removes
     * that file again when it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Only try to delete the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
239
240
    /**
     * Reports the disabled status by testing whether the marker file
     * ($this->processFilename) exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
249
250
    /**
     * Overrides the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
259
260
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
267
268
    /**
     * Returns the logger for this class, creating it on first use and
     * caching it in $this->logger afterwards.
     *
     * @return Logger
     */
    private function getLogger(): Logger
    {
        if ($this->logger !== null) {
            return $this->logger;
        }

        $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
        $this->logger = $logManager->getLogger(__CLASS__);

        return $this->logger;
    }
277
278
    /************************************
279
     *
280
     * Getting URLs based on Page TSconfig
281
     *
282
     ************************************/
283
284
    /**
     * Wires up the repositories, the backend user, the marker file path and
     * the extension settings.
     *
     * Settings are read from the serialized extConf entry of the crawler
     * extension. "countInARun" falls back to 100 when it is missing or does
     * not convert to a positive integer; "processLimit" is forced into the
     * range 1..99 with a default of 1.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // The global may be unset in some contexts; fall back to null instead of raising a notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extConf entry may be absent. Objects are never expected in the
        // settings, so class instantiation during unserialize is switched off.
        $serializedSettings = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'] ?? '';
        $settings = is_string($serializedSettings)
            ? unserialize($serializedSettings, ['allowed_classes' => false])
            : [];
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
306
307
    /**
     * Replaces the extension settings held by this controller
     * (the unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * @param array $extensionSettings complete settings array; it replaces the current one, nothing is merged
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
317
318
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page (unless "crawlHiddenPages" is enabled), built-in doktype
     * restrictions, doktype lists registered under "excludeDoktype", and
     * finally the "pageVeto" hooks.
     *
     * @param array $pageRow page record; the keys 'hidden' and 'doktype' are read
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the extension is configured to crawl them.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktype lists registered in the configuration, keyed by a name that ends up in the message.
        $excludedDoktypeLists = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'];
        if (is_array($excludedDoktypeLists)) {
            foreach ($excludedDoktypeLists as $listName => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $listName . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'];
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $hookName => $userFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($userFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto ends the evaluation, later hooks are not executed
                return is_string($veto)
                    ? $veto
                    : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
            }
        }

        return false;
    }
381
382
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped, the reason is written to $skipMessage and an empty array is
     * returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage receives the skip reason, or '' when the page is processed
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Compare numerically: a strict "=== 2" never matches when the value is
        // the numeric string '2', and the key may be absent from the row.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
406
407
    /**
     * Tells whether the queue table holds NO unprocessed entry (exec_time = 0)
     * for the given page id and configuration hash.
     * When none exists, callers can skip a more detailed per-entry check.
     *
     * @param  int $uid page id, cast to integer before it is used in the query
     * @param  string $configurationHash
     * @return boolean true when the count of matching entries is zero
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $samePage = $queryBuilder->expr()->eq('page_id', intval($uid));
        $sameConfiguration = $queryBuilder->expr()->eq(
            'configuration_hash',
            $queryBuilder->createNamedParameter($configurationHash)
        );
        $notYetExecuted = $queryBuilder->expr()->eq('exec_time', 0);

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where($samePage, $sameConfiguration, $notYetExecuted)
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
438
439
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param    array        $vv Information about URLs from pageRow to crawl (the keys 'URLs' and 'subCfg' are read)
     * @param    array        $pageRow Page row (the 'uid' key is read)
     * @param    integer      $scheduledTime Unix time to schedule indexing to, typically time()
     * @param    integer      $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param    boolean      $submitCrawlUrls If set, submits the URLs to queue
     * @param    boolean      $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param    array        $duplicateTrack Passed by reference; holds one key per URL so that duplicates are not crawled
     * @param    array        $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param    array        $incomingProcInstructions Array of processing instructions
     * @return   string       List of URLs (meant for display in backend module), or 'ERROR - no URL generated' when $vv['URLs'] is not an array
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // Stays null unless realurl encoding is active for this configuration.
        // The loop below tests this variable, so it is defined on every path.
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if (ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl']) {

            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (is_array($vv['URLs'])) {
            $configurationHash = $this->getConfigurationHash($vv);
            $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

            // Seconds between two scheduled requests. A request rate of zero or
            // less would divide by zero, so it is treated as "no interleave".
            $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

            foreach ($vv['URLs'] as $urlQuery) {
                if ($this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {

                    // Calculate cHash:
                    if ($vv['subCfg']['cHash']) {
                        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                        $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                        $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
                    }

                    // Create key by which to determine unique-ness:
                    $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

                    // realurl support (thanks to Ingo Renner)
                    $urlQuery = 'index.php' . $urlQuery;
                    if ($urlObj !== null) {
                        $params = [
                            'LD' => [
                                'totalURL' => $urlQuery
                            ],
                            'TCEmainHook' => true
                        ];
                        $urlObj->encodeSpURL($params);
                        $urlQuery = $params['LD']['totalURL'];
                    }

                    // Scheduled time, rounded down to the full minute:
                    $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                    $schTime = floor($schTime / 60) * 60;

                    // NOTE(review): $urlList is assigned, not appended, in both branches below,
                    // so only the last URL of the loop ends up in the return value. This is kept
                    // as found - confirm with the callers whether a full list is expected.
                    if (isset($duplicateTrack[$uKey])) {

                        // if the url key is already registered, just display it and do not resubmit it
                        $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
                    } else {
                        $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                        $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                        $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                        // Submit for crawling!
                        if ($submitCrawlUrls) {
                            $added = $this->addUrl(
                                $pageRow['uid'],
                                $theUrl,
                                $vv['subCfg'],
                                $scheduledTime,
                                $configurationHash,
                                $skipInnerCheck
                            );
                            if ($added === false) {
                                $urlList .= ' (Url already existed)';
                            }
                        } elseif ($downloadCrawlUrls) {
                            $downloadUrls[$theUrl] = $theUrl;
                        }

                        $urlList .= '<br />';
                    }
                    $duplicateTrack[$uKey] = true;
                }
            }
        } else {
            $urlList = 'ERROR - no URL generated';
        }

        return $urlList;
    }
568
569
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     * Otherwise at least one incoming instruction has to be contained in the
     * comma-separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit result for "no match"; without it the method returned null
        // although it is documented as returning a boolean.
        return false;
    }
588
589
    /**
     * Loads the Page TSconfig used for the given page and lets registered
     * hooks adjust it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read from the page
     * id found after the '-' in the MP value instead of from $id.
     *
     * @param int $id page id
     * @return array Page TSconfig, possibly modified by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId($id)
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $tsConfigPageId = $mountPointParts[1];
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            // the TSconfig is handed over by reference so that hooks can change it in place
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
611
612
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. Page TSconfig (tx_crawler.crawlerCfg.paramSets), origin 'pagets'
     *  2. tx_crawler_configuration records found on the pages of the rootline,
     *     origin 'tx_crawler_configuration_<uid>'. A record never overwrites an
     *     entry of the same name that was collected earlier.
     * Afterwards the "processUrls" hooks may modify the result by reference.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array configurations keyed by name; each entry holds 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {

        /**
         * Get configuration from tsConfig
         */

        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        $res = [];

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                    if (is_array($values)) {
                        $key = str_replace('.', '', $key);
                        // Sub configuration for a single configuration string:
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                        $subCfg['key'] = $key;

                        if (strcmp($subCfg['procInstrFilter'], '')) {
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                        }
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                        // process configuration if it is not page-specific or if the specific page is the current page:
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {

                            // add trailing slash if not present
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                                $subCfg['baseUrl'] .= '/';
                            }

                            // Explode, process etc.:
                            $res[$key] = [];
                            $res[$key]['subCfg'] = $subCfg;
                            $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                            $res[$key]['origin'] = 'pagets';

                            // recognize MP value
                            if (!$this->MP) {
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                            } else {
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
                            }
                        }
                    }
                }
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        // The enable-fields clause does not depend on the page, so build it once.
        // The first four characters are cut off as before (presumably the leading
        // ' AND' - TODO confirm against BackendUtility::BEenableFields()).
        // Deleted records are excluded by the DeletedRestriction added below, which
        // makes the deprecated BackendUtility::deleteClause() unnecessary.
        $enableFieldsClause = substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4);

        foreach ($rootLine as $page) {
            // A fresh builder per page keeps select/from/where parts from piling up across iterations.
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
            $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));

            $constraints = [
                $queryBuilder->expr()->eq('pid', (int)$page['uid'])
            ];
            // Only add the enable-fields part when there is one; an empty string must not reach the WHERE clause.
            if (is_string($enableFieldsClause) && $enableFieldsClause !== '') {
                $constraints[] = $enableFieldsClause;
            }

            $configurationRecordsForCurrentPage = $queryBuilder
                ->select('*')
                ->from('tx_crawler_configuration')
                ->where(...$constraints)
                ->execute()
                ->fetchAll();

            if (is_array($configurationRecordsForCurrentPage)) {
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {

                    // check access to the configuration record
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                        // process configuration if it is not page-specific or if the specific page is the current page:
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
                            $key = $configurationRecord['name'];

                            // don't overwrite previously defined paramSets
                            if (!isset($res[$key])) {

                                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                                $subCfg = [
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                                    'procInstrParams.' => $TSparserObject->setup,
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                                        $configurationRecord['base_url'],
                                        $configurationRecord['sys_domain_base_url'],
                                        $isCrawlingProtocolHttps
                                    ),
                                    'realurl' => $configurationRecord['realurl'],
                                    'cHash' => $configurationRecord['chash'],
                                    'userGroups' => $configurationRecord['fegroups'],
                                    'exclude' => $configurationRecord['exclude'],
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                                    'key' => $key
                                ];

                                // add trailing slash if not present
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                                    $subCfg['baseUrl'] .= '/';
                                }
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                                    $res[$key] = [];
                                    $res[$key]['subCfg'] = $subCfg;
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
                                }
                            }
                        }
                    }
                }
            }
        }

        // Hooks receive the result by reference and may change it in place.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
763
764
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero and a sys_domain record with that
     * uid and a non-empty "domainName" is found, the URL is built from that
     * domain name and the scheme selected by $ssl. In every other case the
     * given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param int $sysDomainUid Uid of the sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl Boolean false selects "http", anything else selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = (int)$sysDomainUid;
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Ask for a query builder of the table that is actually read, and only when a lookup is required.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('domainName')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq(
                    'uid',
                    $queryBuilder->createNamedParameter($sysDomainUid, \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetch();

        // fetch() does not return an array when no record matched.
        if (!is_array($row) || (string)($row['domainName'] ?? '') === '') {
            return $baseUrl;
        }

        $urlScheme = ($ssl === false) ? 'http' : 'https';

        return $urlScheme . '://' . $row['domainName'];
    }
794
795
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined:
     * - the "tx_crawler.crawlerCfg.paramSets." entries of the page TSconfig of $rootid
     *   (only entries whose value is an array; a trailing dot is stripped from the key)
     * - the "name" of tx_crawler_configuration records stored on any page of the
     *   rootline of $rootid or of the page tree below $rootid (down to $depth levels)
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of levels passed to the page tree lookup
     * @return array List of configuration names (may contain duplicates)
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // "??" keeps missing TSconfig levels from triggering undefined index notices.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) === '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Pages of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $node) {
            $pids[] = $node['uid'];
        }

        // ... plus the pages of the tree below $rootid, restricted by the backend user's page permissions.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in(
                    'pid',
                    $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
                )
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
854
855
    /**
     * Creates a new query builder on the connection that is responsible for the given table.
     *
     * @param string $table Name of the table the connection is looked up for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
867
868
    /**
     * Tells whether a user with the given groups may access an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
889
890
    /**
     * Parses a query string ("a=1&b=2") into an array of key => value pairs.
     *
     * Names and values are raw-url-decoded. Parts with an empty name are
     * skipped; a part without "=" yields an empty string as value. Only the
     * first "=" of a part separates name and value.
     *
     * @param string $inputQuery Input query string
     * @return array Decoded parameter names mapped to their decoded values
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so that a part without "=" does not cause an undefined offset.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
912
913
    /**
     * Expands the parameter configuration into individual values per parameter.
     *
     * Syntax of values:
     * - If the (trimmed) value is wrapped in [...] it is expanded as described below, otherwise it is taken literally.
     * - The content of the brackets is split by "|"; each part is processed on its own and the results are added together.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (both ends included, low to high,
     *           max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is current page]];[_ENABLELANG:1]" = Look up of
     *           table records, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"),
     *           _WHERE and _ADDTABLE.
     *           With _ENABLELANG set, only records whose transOrigPointerField is <= 0 are selected.
     *         - Default: Literal value
     * - After every part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as the input, each mapped to the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Without surrounding square brackets the value is literal and becomes the only value.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the content of the brackets and collect the expanded values in an array.
            $paramArray[$p] = [];
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                $trimmedPart = trim($pV);

                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    $low = $reg[1];
                    $high = $reg[2];

                    // Swap if first is larger than last:
                    if ($low > $high) {
                        $temp = $high;
                        $high = $low;
                        $low = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $low; $a <= $high; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr($trimmedPart, 0, 7) === '_TABLE:') {
                    // Parse the "key:value" sub parameters; a missing value becomes null.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq(
                                        $queryBuilder->quoteIdentifier($pidField),
                                        $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)
                                    ),
                                    $where
                                );

                            // Only touch the FROM part when _ADDTABLE is given; an empty value must not replace it.
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by the selected field's value so that array_keys() yields the
                            // looked-up values (each once), not the name of the field.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values:
            $paramArray[$p] = array_unique($paramArray[$p]);
            // NOTE(review): this sorts the parameter names (outer array), not the values of $p.
            // Kept as is because it determines the parameter order of the result - confirm the intent.
            ksort($paramArray);
        }

        return $paramArray;
    }
1052
1053
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter;
     * the method then recurses with the remaining parameters. Without a limit
     * the number of URLs is the product of the number of values of all
     * parameters. Compilation of one parameter stops as soon as the number of
     * URLs exceeds the "maxCompileUrls" extension setting (forced into the
     * range 1 to 1000000000, default 10000).
     *
     * An empty value adds nothing to the URL; otherwise "&name=value" is
     * appended with both parts raw-url-encoded.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while looping, so determine it once.
        $maxCompileUrls = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );

        // Shift first parameter off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL with every value of the parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp((string)$val, '') ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode((string)$val) : '');

                // Leave BOTH loops; leaving only the inner one would keep adding
                // one URL for each remaining base URL.
                if (count($newUrls) > $maxCompileUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1086
1087
    /************************************
1088
     *
1089
     * Crawler log
1090
     *
1091
     ************************************/
1092
1093
    /**
     * Returns the records of the crawler queue for a page ID - or deletes them.
     *
     * Reading (default): the entries of the page are returned ordered by
     * "scheduled" descending, limited to $itemsPerPage if that is greater than 0.
     *
     * Flushing ($doFlush): nothing is read. The entries of the page matching
     * $filter are deleted - or all queue entries if $doFullFlush is set - and an
     * empty array is returned.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), the delete is not restricted at all
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same filter is collected a second time as an AND-combined expression,
        // because flushQueue() expects its condition as a string.
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1148
1149
    /**
     * Returns the records of the crawler queue for a set ID - or deletes them.
     *
     * Reading (default): the entries of the set are returned ordered by
     * "scheduled" descending, limited to $itemsPerPage if that is greater than 0.
     *
     * Flushing ($doFlush): nothing is read. The entries of the set matching
     * $filter are deleted - or all queue entries if $doFullFlush is set - and an
     * empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), the delete is not restricted at all
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same filter is collected a second time as an AND-combined expression,
        // because flushQueue() expects its condition as a string.
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty condition makes flushQueue() delete every entry.
            $this->flushQueue($doFullFlush ? '' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1202
1203
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event
     * is posted once per affected set_id - together with the entries of that
     * set which are about to be removed - before the entries are deleted.
     *
     * @param string $where SQL condition selecting the entries which should be removed; an empty value removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $where = (string)$where;
        $realWhere = $where !== '' ? $where : '1=1';

        // Query builders are stateful (e.g. from() appends), so every query gets its own instance.
        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // select() quotes its arguments as identifiers, so "DISTINCT set_id" cannot be
            // passed to it; grouping by the column yields the distinct set ids.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): other code of this class addresses rows of tx_crawler_queue by "qid";
                // confirm that the queue table really has a "uid" column.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1244
1245
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the
     * serialized parameters. The entry is inserted as not yet executed
     * (exec_time 0) and with empty result data.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to the current time when no (non-zero) schedule time is given.
        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1275
1276
    /************************************
1277
     *
1278
     * URL setting
1279
     *
1280
     ************************************/
1281
1282
    /**
     * Puts a URL into the crawler queue.
     *
     * The queue record is compiled from the URL and the sub configuration.
     * If queue entries are only registered internally, the record is appended
     * to $this->queueEntries and nothing is stored. Otherwise the record is
     * inserted into tx_crawler_queue - unless the duplication check finds an
     * equal entry - and the "urlAddedToQueue" or "duplicateUrlInQueue" event
     * is posted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry has been inserted into the database
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Parameters stored (serialized) with the queue entry
        $parameters = ['url' => $url];

        // Frontend user group simulation: unique integer uids, only set when not empty
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Processing instructions and their parameters
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile the queue record
        $serializedParameters = serialize($parameters);
        $queueRecord = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        // The entry is only registered and not stored to the database
        if ($this->registerQueueEntriesInternallyOnly) {
            $this->queueEntries[] = $queueRecord;

            return false;
        }

        // Look for an equal entry unless the check is skipped
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $queueRecord);
        }

        if (count($duplicateRows) > 0) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicateRows, 'fieldArray' => $queueRecord]
            );

            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
        $insertedId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');

        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $insertedId, 'fieldArray' => $queueRecord]
        );

        return true;
    }
1364
1365
    /**
     * This method determines duplicates for a queue entry with the same page id and parameters hash.
     *
     * Only entries that are still waiting (no exec_time and no process_id set) are considered.
     * If the timestamp is now or in the past, any waiting entry scheduled up to now is a duplicate;
     * with the "enableTimeslot" extension setting, entries scheduled within 100 seconds around now match as well.
     * If the timestamp is in the future, the queued entry must have exactly the same scheduled time.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Queue row of that entry; "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = (int)$this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now": every waiting entry that is already due is a duplicate.
            $scheduledUpToNow = $queryBuilder->expr()->lte(
                'scheduled',
                $queryBuilder->createNamedParameter($currentTime, \PDO::PARAM_INT)
            );
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUpToNow);
            } else {
                $queryBuilder->where($scheduledUpToNow);
            }
        } else {
            // An entry with a timestamp in the future needs to have the same schedule time.
            // The value is bound as an integer parameter instead of being concatenated into the SQL.
            $queryBuilder->where(
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT)
                )
            );
        }

        // A duplicate has to be an unprocessed entry: not executed yet and not claimed by a process.
        // The previous "!= 0" comparisons selected exactly the opposite set of rows.
        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while (($row = $statement->fetch()) !== false) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1426
1427
    /**
     * Provides the current Unix timestamp as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1436
1437
    /************************************
1438
     *
1439
     * URL reading
1440
     *
1441
     ************************************/
1442
1443
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as executed (exec_time), processes it through readUrl_exec()
     * and stores the serialized result in the record's result_data field.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask; CLI_STATUS_POLLABLE_PROCESSED is set when a registered "pollSuccess"
     *                 instruction reported success. 0 if the queue entry was not found.
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }

        // Get entry:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; return the (empty) status mask as documented instead of null.
            return $ret;
        }

        // unserialize() yields false for broken data, so check the type before reading keys.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->getLogger()->log(
                LogLevel::WARNING,
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() hands back the string 'ERROR' or false on failure, so only a result array
        // carries a "content" entry that can be decoded.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1554
1555
    /**
     * Processes a queue entry that has not been stored yet.
     *
     * The entry is inserted with exec_time set to the current time, executed through readUrl_exec()
     * and the serialized result is written to the result_data field of the new row.
     *
     * @param array $field_array Queue field array
     *
     * @return mixed The value returned by readUrl_exec() for the inserted entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $field_array);

        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference, so slots can adjust what gets stored.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1593
1594
    /**
     * Executes a queue record.
     *
     * If the record's parameters name a "_CALLBACKOBJ", that object is instantiated and its
     * crawler_execute() method is called; otherwise the record's URL is fetched via requestUrl().
     *
     * @param array $queueRec Queue record
     * @return array|string|bool Result array, the string 'ERROR' if the parameters cannot be decoded,
     *                           or whatever requestUrl() returns for a regular frontend request
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        if (!$parameters['_CALLBACKOBJ']) {
            // Regular FE request. Prepare the crawler id: queue id plus verification hash.
            $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

            // Get result:
            $response = $this->requestUrl($parameters['url'], $crawlerId);

            EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $response]);

            return $response;
        }

        // Calling object:
        $objRef = $parameters['_CALLBACKOBJ'];
        $callBackObj = GeneralUtility::makeInstance($objRef);
        if (!is_object($callBackObj)) {
            return ['content' => 'No object: ' . $objRef];
        }

        unset($parameters['_CALLBACKOBJ']);

        return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
    }
1629
1630
    /**
     * Gets the content of a URL.
     *
     * With the "makeDirectRequests" extension setting the request is delegated to sendDirectRequest().
     * Otherwise a plain HTTP/1.0 request is sent over a socket, optionally through the proxy configured
     * in $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']. With the "follow30x" extension setting,
     * redirects are followed until $recursion is used up.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time, passed to fsockopen() as connection timeout
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean For socket requests an array with "request", "headers" and "content"
     *                       (plus "parentRequest" after a redirect); false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // Log the original string; $url is false here and carries no information.
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits the scheme for scheme-less URLs, which are accepted like http.
        $scheme = $url['scheme'] ?? '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            // Connect to the proxy and request the absolute target URL from it.
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . (($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $explicitPort = $rurl['port'] ?? 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = ($explicitPort > 0) ? $explicitPort : 443;
        } else {
            $port = ($explicitPort > 0) ? $explicitPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))) {
            // Follow the redirect exactly once per hop, keeping the timeout and counting down the
            // recursion limit so that redirect loops terminate.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (!is_array($newRequestUrl)) {
                $this->getLogger()->log(
                    LogLevel::DEBUG,
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1740
1741
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" if the site is
     * reachable at http://mydomain.com/cms/index.php.
     *
     * Sources are tried in this order: the "frontendBasePath" extension setting, the absRefPrefix
     * of $GLOBALS['TSFE'], and - unless running in CLI mode - the TYPO3_SITE_PATH environment value.
     * Without any of them "/" is used.
     *
     * @return string Base path of the website frontend, "/" or of the form "/<pathSegments>/"
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Path from the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
            // Outside of CLI mode the base path can be determined from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Normalise to exactly one leading and one trailing slash.
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1771
1772
    /**
     * Runs a command through shell_exec() and hands back what it returned.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1782
1783
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything after it is
     * collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     *                                  Both are empty if $streamPointer is not a resource.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // fgets() signals the end of the stream with false; comparing strictly keeps a line
        // consisting of "0" from being mistaken for the end.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1814
1815
    /**
     * Appends a timestamped line to the file named by the "logFileName" extension setting.
     * Does nothing if that setting is empty; a failed write is reported to the logger.
     *
     * @param string $message Text to append after the timestamp
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $bytesWritten = @file_put_contents(
            $this->extensionSettings['logFileName'],
            date('Ymd His') . ' ' . $message . PHP_EOL,
            FILE_APPEND
        );
        if ($bytesWritten) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $this->extensionSettings['logFileName'])
        );
    }
1831
1832
    /**
     * Builds HTTP request headers.
     *
     * @param array $url URL components as returned by parse_url(); "host" is expected,
     *                   "path", "query", "user" and "pass" are optional
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array Request line followed by the header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() leaves out components that are missing in the URL. A missing path must
        // become "/", otherwise the request line would read "GET  HTTP/1.0".
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1856
1857
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved" are treated
     * as redirects. If a user is given, the credentials are inserted into the redirect target.
     *
     * @param array $headers HTTP Header lines, the status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Splitting at the first colon only keeps values that
        // contain ": " intact; header names are compared case-insensitively.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        // Re-inject the credentials into the redirect target.
        $target = parse_url($location);
        if (!$target) {
            return false;
        }
        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '');
        if (isset($target['port'])) {
            // Keep a non-default port of the redirect target.
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= $target['path'] ?? '';
        if (($target['query'] ?? '') !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1899
1900
    /**************************
1901
     *
1902
     * tslib_fe hooks:
1903
     *
1904
     **************************/
1905
1906
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On a match the crawler state is set up in
     * $params['pObj']->applicationData['tx_crawler']; otherwise the request is terminated.
     * Requests without the header are left untouched.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // The header is supplied by the client; pad the parts so that a value without ":" does
        // not read an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $expectedHash = is_array($queueRec)
            ? md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])
            : null;

        // If a crawler record was found and hash was matching, set it up.
        // hash_equals() compares in constant time, so the expected hash cannot be probed via timing.
        if ($expectedHash !== null && hash_equals($expectedHash, (string)$hash)) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1943
1944
    /*****************************
1945
     *
1946
     * Compiling URLs to crawl - tools
1947
     *
1948
     *****************************/
1949
1950
    /**
     * Walks the page tree below the given page and collects the output of
     * drawURLs_addRowsForPage() for every page, including the pages mounted by mount points.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // The record is read from "pages", so the query builder is requested for that table.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a record there is nothing mounted; the page is then handled like a regular one.
                if (is_array($mountpage)) {
                    // fetch mounted pages
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
2058
2059
    /**
     * Expands an exclude string into the flat list of page uids it covers.
     *
     * The exclude string is a comma separated list of parts, each either "<pid>" or
     * "<pid>+<depth>". A part without "+" covers only the page itself. A part with "+"
     * but an empty (or zero) depth is expanded with a depth of 99.
     *
     * Results are memoized in function-level static caches for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page uids (keys as left by array_unique())
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches: final result per exclude string, raw tree per pid/depth
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry as well
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad to two elements so a part without "+" does not raise an undefined offset notice
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
2110
2111
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (inserted into the output as-is)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference in getUrlsForPageRow() - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (count($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column: only the first row carries the page title, spanning all rows
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                } else {
                    $titleClm = '';
                }

                if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: plain configuration values, escaped like the other text cells of the row
                    $optionValues = '';
                    $optionLabels = [
                        'userGroups' => 'User Groups: ',
                        'baseUrl' => 'Base Url: ',
                        'procInstrFilter' => 'ProcInstr: ',
                    ];
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if (!empty($confArray['subCfg'][$optionKey])) {
                            $optionValues .= $optionLabel . htmlspecialchars($confArray['subCfg'][$optionKey]) . '<br/>';
                        }
                    }

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
2228
2229
    /*****************************
2230
     *
2231
     * CLI functions
2232
     *
2233
     *****************************/
2234
2235
    /**
     * Returns one value of a parsed CLI option.
     *
     * Reads from $this->cliArgs, the structure written by setCliArgs(): a map of option
     * name to the list of values that were given for it.
     *
     * @param string $option Option string, eg. "-s"
     * @param int $idx Value index, default is 0 (zero) = the first one...
     * @return string The value, or an empty string if the option or the index is not set
     */
    private function cli_argValue($option, $idx = 0)
    {
        $values = $this->cliArgs[$option] ?? null;

        return is_array($values) ? ($values[$idx] ?? '') : '';
    }
2245
2246
    /**
     * Writes a line to the standard output.
     *
     * Echoes directly (as CLI_debug() does) because this controller has no outputLine() method.
     *
     * @param string $string The string to output
     * @return void
     */
    private function cli_echo($string)
    {
        echo $string . "\n";
        flush();
    }
2254
2255
    /**
     * Set cli args
     *
     * This is a copy from the CommandLineController from TYPO3 < v9
     *
     * Parses the raw argument list into a map of option name => list of values and stores
     * it in $this->cliArgs. Tokens before the first option are collected under "_DEFAULT".
     * A token is an option when it starts with "-" and its second character is not a digit
     * (so negative numbers stay values). "-opt=value" stores "value" as the first value of "-opt".
     * Using the same option twice prints an error and terminates the script.
     *
     * TODO: Rework
     *
     * @param array $argv Raw command line tokens
     * @return void
     */
    private function setCliArgs(array $argv)
    {
        $cli_options = [];
        $index = '_DEFAULT';
        foreach ($argv as $token) {
            $token = (string)$token;

            // Options starting with a number is invalid - they could be negative values!
            // The offset checks avoid notices for empty and one-character tokens.
            $isOption = $token !== ''
                && $token[0] === '-'
                && !MathUtility::canBeInterpretedAsInteger($token[1] ?? '');

            if (!$isOption) {
                $cli_options[$index][] = $token;
                continue;
            }

            $parts = explode('=', $token, 2);
            $index = $parts[0];
            if (isset($cli_options[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cli_options[$index] = [];
            // Only "-opt=value" carries an inline value
            if (isset($parts[1])) {
                $cli_options[$index][] = $parts[1];
            }
        }

        $this->cliArgs = $cli_options;
    }
2286
2287
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the first value of the "-conf" option and splits it on commas.
     *
     * @return array Array of keys, empty if no keys were given
     */
    protected function getConfigurationKeys()
    {
        // Pass the value index explicitly: cli_argValue() declares it as a parameter
        $parameter = trim($this->cli_argValue('-conf', 0));

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2297
2298
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, optionally purge executed queue entries older than the
     * configured "purgeQueueDays", select up to $countInARun due queue entries, reserve
     * them for this process and read each URL.
     *
     * @param int $countInARun Maximum number of queue entries to handle in this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep after the batch has been processed
     * @return int Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // Each statement gets its own query builder so no query parts leak into the next one
            $purgeQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
            $del = $purgeQuery
                ->delete($this->tableName)
                ->where(
                    $purgeQuery->expr()->neq('exec_time', 0),
                    $purgeQuery->expr()->lt('exec_time', $purgeQuery->createNamedParameter($purgeDate, \PDO::PARAM_INT))
                )
                ->execute();

            // Strict check: zero deleted rows is not a failure
            if (false === $del) {
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $rows = $selectQuery
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQuery->expr()->eq('exec_time', 0),
                $selectQuery->expr()->eq('process_scheduled', 0),
                $selectQuery->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = (int)$r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process

            //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
            //TODO make sure we're not taking assigned queue-entires

            //save the number of assigned queue entrys to determine who many have been processed later
            $updateQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $updateQuery
                ->update('tx_crawler_queue')
                ->where(
                    $updateQuery->expr()->in('qid', $quidList)
                )
                // set() binds the value as a named parameter itself
                ->set('process_scheduled', (int)$this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is a hash string, so it must not be cast to int here
            $connectionPool->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            if ($numberOfAffectedRows == count($quidList)) {
                //$this->queryBuilder->getConnection()->executeQuery('COMMIT');
            } else {
                //$this->queryBuilder->getConnection()->executeQuery('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2426
2427
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        $hookClasses = $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'];
        if (!is_array($hookClasses)) {
            return;
        }

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2444
2445
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active, non-deleted process rows whose ttl has passed are collected as orphans and
     * released afterwards; the remaining ones count against the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process row was inserted, false if the limit is reached
     *                 or no system process id could be determined
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                // ttl expired: treat as orphan, it does not count as running
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        // NOTE: CLI_deleteProcessesMarkedDeleted() is marked deprecated since crawler v7.0.0
        $this->CLI_deleteProcessesMarkedDeleted();

        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        return $acquired;
    }
2511
2512
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements in a fixed order on one shared query builder:
     *  1. detach queue entries that belong to inactive processes,
     *  2. reset system_process_id of inactive processes that still own unexecuted queue entries,
     *  3. mark the requested, non-deleted processes without unexecuted queue entries as inactive,
     *  4. detach the unexecuted queue entries of the requested processes.
     *
     * The statement order and the shared builder are kept on purpose: query parts that are
     * not reset carry over from one statement to the next.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (count($releaseIds) <= 0) {
            return false;   //nothing to release
        }

        if (!$withinLock) {
            //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // (1) detach queue entries whose process is no longer active
        $queryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // NOTE(review): the pre-Doctrine statement updated rows with "active=0 AND deleted=0 AND
        // NOT EXISTS (queue entry of that process with exec_time = 0)" and set deleted = 1 and
        // system_process_id = 0. Statement (2) below selects with IN(...) instead of NOT EXISTS
        // and never sets "deleted" - confirm the intended behaviour.
        $queryBuilder->resetQueryPart('set');

        // (2) reset the system process id of inactive processes
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // (3) mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement, so the assignment
        // from statement (2) presumably stays part of this UPDATE as well - confirm.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $queryBuilder->resetQueryPart('set');

        // (4) detach the not yet executed queue entries of the requested processes
        $queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        if (!$withinLock) {
            //$this->queryBuilder->getConnection()->executeQuery('COMMIT');
        }

        return true;
    }
2606
2607
    /**
     * Delete processes marked as deleted
     *
     * Removes every row from tx_crawler_process whose "deleted" flag is 1.
     *
     * @return void
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted()
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $deleteQuery = $connectionPool->getQueryBuilderForTable($this->tableName);

        $deleteQuery->delete('tx_crawler_process');
        $deleteQuery->where('deleted = 1');
        $deleteQuery->execute();
    }
2623
2624
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process row (lowest ttl first) and reports whether its "active" flag is 1.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false as well when no row exists for the id
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // The process id is a hash string (see CLI_buildProcessId()), so it is bound as a
        // string parameter instead of being cast to int.
        $statement = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter($pid))
            )
            ->orderBy('ttl')
            ->execute();

        $row = $statement->fetch();
        if (!$row) {
            return false;
        }

        return intval($row['active']) === 1;
    }
2653
2654
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and then reused for
     * every later call on this instance.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2666
2667
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float Passed through to the native function
     *
     * @return mixed Whatever the native microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $now = \microtime($get_as_float);

        return $now;
    }
2676
2677
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting evaluating to a non-zero integer.
     *
     * @param  string $msg  the message
     * @return void
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2689
2690
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the configured
     * PHP binary, passing the frontend base path, the URL and the serialized request headers,
     * executes it and logs the URL together with the elapsed time.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty if the URL cannot be parsed, otherwise the keys
     *                "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every argument is escaped on its own before the parts are joined with single spaces
        $cmd = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ]);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2728
2729
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2745
2746
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates the global time tracker if none exists yet, then builds $GLOBALS['TSFE']
     * for the given page and runs its initialisation steps in order.
     *
     * @param int $id Page id
     * @param int $typeNum Page type number
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        // NOTE: EidUtility::initTCA() is marked deprecated since TYPO3 v9.4
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $tsfe = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        // Register globally first; the following calls operate on the same object
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();
    }
2771
2772
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so they do not
     * influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
2784
2785
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration value decides: -1 gives false, 1 gives true and
     * 0 returns the page configuration value unchanged. Any other value gives false.
     * Values are compared loosely, in the order -1, 0, 1.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        if ($crawlerConfiguration == -1) {
            return false;
        }
        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }
        if ($crawlerConfiguration == 1) {
            return true;
        }

        return false;
    }
2805
}
2806