Completed
Push — master ( 1b04e6...82e336 )
by Tomas Norre
08:07
created

Classes/Controller/CrawlerController.php (3 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\IconUtility;
35
use AOE\Crawler\Utility\SignalSlotUtility;
36
use TYPO3\CMS\Backend\Utility\BackendUtility;
37
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Database\DatabaseConnection;
40
use TYPO3\CMS\Core\Log\LogLevel;
41
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
42
use TYPO3\CMS\Core\Utility\DebugUtility;
43
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
44
use TYPO3\CMS\Core\Utility\GeneralUtility;
45
use TYPO3\CMS\Core\Utility\MathUtility;
46
use TYPO3\CMS\Extbase\Object\ObjectManager;
47
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
48
use TYPO3\CMS\Frontend\Page\PageGenerator;
49
use TYPO3\CMS\Frontend\Page\PageRepository;
50
use TYPO3\CMS\Frontend\Utility\EidUtility;
51
use TYPO3\CMS\Lang\LanguageService;
52
53
/**
54
 * Class CrawlerController
55
 *
56
 * @package AOE\Crawler\Controller
57
 */
58
class CrawlerController
59
{
60
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
61
    const CLI_STATUS_REMAIN = 1; //queue not empty
62
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
63
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
64
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
65
66
    /**
67
     * @var integer
68
     */
69
    public $setID = 0;
70
71
    /**
72
     * @var string
73
     */
74
    public $processID = '';
75
76
    /**
77
     * One hour is max stalled time for the CLI
78
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
79
     *
80
     * @var integer
81
     */
82
    public $max_CLI_exec_time = 3600;
83
84
    /**
85
     * @var array
86
     */
87
    public $duplicateTrack = [];
88
89
    /**
90
     * @var array
91
     */
92
    public $downloadUrls = [];
93
94
    /**
95
     * @var array
96
     */
97
    public $incomingProcInstructions = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $incomingConfigurationSelection = [];
103
104
    /**
105
     * @var bool
106
     */
107
    public $registerQueueEntriesInternallyOnly = false;
108
109
    /**
110
     * @var array
111
     */
112
    public $queueEntries = [];
113
114
    /**
115
     * @var array
116
     */
117
    public $urlList = [];
118
119
    /**
120
     * @var boolean
121
     */
122
    public $debugMode = false;
123
124
    /**
125
     * @var array
126
     */
127
    public $extensionSettings = [];
128
129
    /**
130
     * Mount Point
131
     *
132
     * @var boolean
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var DatabaseConnection
150
     */
151
    private $db;
152
153
    /**
154
     * @var BackendUserAuthentication
155
     */
156
    private $backendUser;
157
158
    /**
159
     * @var integer
160
     */
161
    private $scheduledTime = 0;
162
163
    /**
164
     * @var integer
165
     */
166
    private $reqMinute = 0;
167
168
    /**
169
     * @var bool
170
     */
171
    private $submitCrawlUrls = false;
172
173
    /**
174
     * @var bool
175
     */
176
    private $downloadCrawlUrls = false;
177
178
    /**
179
     * @var QueueRepository
180
     */
181
    protected  $queueRepository;
182
183
    /**
     * Returns the internal access mode.
     *
     * The value is whatever was last passed to setAccessMode(); the property is
     * declared without a default, so null is returned if it was never set.
     *
     * @return string access mode, documented on the property as 'gui', 'cli' or 'cli_im'
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
192
193
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode access mode to remember (returned unchanged by getAccessMode())
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
200
201
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file at the configured process filename;
     * enabling removes that file again if it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Only try to delete the marker when it is actually present
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
217
218
    /**
     * Tells whether the "disabled" marker file exists.
     *
     * @return bool true if the process file exists (disabled), false otherwise
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
231
232
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
241
242
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string file name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
249
250
    /************************************
251
     *
252
     * Getting URLs based on Page TSconfig
253
     *
254
     ************************************/
255
256 28
    /**
     * Wires up the queue repository, the database and backend user handles,
     * the default process file name and the extension settings.
     *
     * Settings are read from the serialized extension configuration in
     * $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']; a missing or
     * non-array configuration is treated as an empty settings array.
     * Afterwards "countInARun" falls back to 100 when it is not a positive
     * integer and "processLimit" is forced into the range 1..99 (default 1).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);

        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration may be absent; avoid an undefined index notice
        $serializedSettings = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']
            : '';
        $settings = (is_string($serializedSettings) && $serializedSettings !== '')
            ? unserialize($serializedSettings)
            : [];
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults:
        $countInARun = isset($this->extensionSettings['countInARun'])
            ? $this->extensionSettings['countInARun']
            : 0;
        if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $processLimit = isset($this->extensionSettings['processLimit'])
            ? $this->extensionSettings['processLimit']
            : null;
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($processLimit, 1, 99, 1);
    }
278
279
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; no defaults are applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
289
290
    /**
     * Decides whether a page must be left out of crawling.
     *
     * The checks run in this order and the first hit wins:
     *  1. hidden page while the "crawlHiddenPages" setting is off
     *  2. doktype 3 or 4, or any doktype >= 199
     *  3. doktype listed in one of the registered "excludeDoktype" lists
     *  4. veto returned by one of the registered "pageVeto" hooks
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $listName => $excludedDoktypes) {
                if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $listName . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookName => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto ends the check, later hooks are not executed
                return is_string($veto)
                    ? $veto
                    : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
            }
        }

        return false;
    }
353
354
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); if it has to be
     * skipped, an empty array is returned and $skipMessage carries the reason.
     * Otherwise $skipMessage is reset to an empty string.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference, receives the skip reason (or '')
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Database rows usually deliver "url_scheme" as a string, so compare the
        // integer value; a strict comparison against int 2 would never match then.
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
378
379
    /**
     * Counts the queue entries of a page that match the given configuration hash
     * and have not been executed yet (exec_time = 0).
     * If there are none, callers can skip the more detailed inner check.
     *
     * @param  int $uid page id
     * @param  string $configurationHash
     * @return boolean true if no such queue entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $where = "page_id=" . intval($uid) . " AND configuration_hash=" . $quotedHash . " AND exec_time=0";

        $result = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $where);
        $countRow = $this->db->sql_fetch_assoc($result);

        return ($countRow['anz'] == 0);
    }
396
397
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl (reads the keys 'subCfg' and 'URLs')
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference, contains a key per URL to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference, filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // The decision does not change while iterating, so it is evaluated once.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A zero or non-numeric rate would
        // cause a division by zero, so in that case no interleave is applied.
        $secondsPerRequest = (is_numeric($reqMinute) && (float)$reqMinute != 0.0) ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to a full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is assigned (not appended) per URL, so only the
            // entry of the last processed URL is returned - confirm this is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed on here,
                    // $schTime is only used for display - confirm against addUrl().
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
528
529
    /**
     * Returns true if at least one incoming processing instruction is an item of
     * the given comma-separated list, or if no processing instructions were
     * passed at all (nothing to filter by).
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // No instruction matched: return an explicit boolean instead of
        // implicitly falling through with null.
        return false;
    }
548
549 4
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point value is set in $this->MP, the TSconfig is read for the
     * id found after the '-' separator of that value instead of $id.
     * Hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive the page id and a reference to the TSconfig array.
     *
     * @param integer $id Page ID
     * @return array page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        if ($this->MP) {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        }

        // Call a hook to alter configuration
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            $hookParameters = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $hookFunction) {
                GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            }
        }

        return $pageTSconfig;
    }
571
572
    /**
     * Collects the crawler configurations that apply to a page.
     * It returns an array of configurations and no urls!
     *
     * Two sources are evaluated, in this order:
     *  1. the "tx_crawler.crawlerCfg.paramSets." entries of the page TSconfig
     *  2. tx_crawler_configuration records found on the pages of the rootline
     *     (a record never overwrites an entry with the same key from step 1
     *     or from an earlier record)
     * Finally the registered "processUrls" hooks may modify the result by reference.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array configurations indexed by their key, each with 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    protected function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
            $paramSets = is_array($crawlerCfg['paramSets.']) ? $crawlerCfg['paramSets.'] : [];

            foreach ($paramSets as $setName => $values) {
                if (!is_array($values)) {
                    continue;
                }

                $key = str_replace('.', '', $setName);
                // Sub configuration for a single configuration string:
                $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                $subCfg['key'] = $key;

                if (strcmp($subCfg['procInstrFilter'], '')) {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                $appliesToPage = !strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id);
                if (!$appliesToPage) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $baseQuery = '?id=' . $id;
                if ($this->MP) {
                    $baseQuery .= '&MP=' . $this->MP;
                }

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
                ];
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        foreach (BackendUtility::BEgetRootLine($id) as $rootLinePage) {
            $records = BackendUtility::getRecordsByField(
                'tx_crawler_configuration',
                'pid',
                intval($rootLinePage['uid']),
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
            );

            if (!is_array($records)) {
                continue;
            }

            foreach ($records as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$hasAccess) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                $appliesToPage = !strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id);
                if (!$appliesToPage) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $tsParser \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $tsParser = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                $tsParser->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $tsParser->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'realurl' => $configurationRecord['realurl'],
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages listed in the record's exclude string get no configuration
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $hookFunction) {
                $hookParameters = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            }
        }

        return $res;
    }
716
717
    /**
     * Checks if a domain record exists and returns the base-url based on the record.
     * If no usable record is found, the given baseUrl string is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid uid of the sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl Anything but strict false selects the "https" scheme, false selects "http"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);

        // The fetch yields no array when the record is missing, hidden or deleted;
        // only index into the row when one was actually returned.
        if (is_array($row) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';

            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
745
746
    /**
     * Collects the names of all crawler configurations available for a page branch.
     *
     * The result contains the paramSet keys from the page TSconfig of $rootid
     * (trailing dot removed) followed by the names of all tx_crawler_configuration
     * records stored on the rootline of $rootid or on pages of the subtree
     * (limited to $depth levels and to pages the backend user may see).
     *
     * @param integer $rootid Page id the branch starts at
     * @param integer $depth Number of tree levels passed to the page tree
     * @return array list of configuration names
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // The ids are concatenated into the SQL statement: force them to integers
        // and bail out early, because "pid IN ()" would be invalid SQL.
        $pids = array_unique(array_map('intval', $pids));
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
791
792
    /**
     * Tells whether a user may access an item, based on group membership.
     * (The group list of the current frontend user is e.g. available in $GLOBALS['TSFE']->gr_list.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs that may access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is open to everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
813
814
    /**
     * Parses the GET variables of a query string into an array of key => value pairs.
     *
     * Pairs are separated by "&"; each pair is split at its first "=". Names and values are
     * rawurldecode()d. Pairs with an empty name are skipped, and a name without "=" is stored
     * with an empty string as its value.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements: a parameter without "=" would otherwise read an undefined offset.
            list($p, $v) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($p)) {
                $paramKeyValues[rawurldecode($p)] = rawurldecode($v);
            }
        }

        return $paramKeyValues;
    }
835
836
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it is expanded according to the following syntax, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their values are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, values included, from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further sub-parts read by the code: _PIDFIELD, _WHERE, _ADDTABLE, _FIELD
     *         - Default: Literal value
     * After every part the registered "expandParameters" hooks are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with the same keys, each holding an array of values
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value wrapped in square brackets is expanded; anything else is the single literal value.
            if (!(substr($v, 0, 1) === '[' && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the configuration inside the brackets and reset the paramArray value to an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (e.g. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the ";"-separated sub-parts into key => value pairs:
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // Pad with NULL so that a sub-part without ":" keeps counting as "not set" below.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            // NOTE(review): _WHERE and _ADDTABLE are appended to the query as given in the configuration.
                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // Rows are indexed by $fieldName, so the array keys are the values to add.
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $key => $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make the set of values unique.
            $paramArray[$p] = array_unique($paramArray[$p]);
            // NOTE(review): this sorts the whole parameter array by parameter name, not the expanded values.
            ksort($paramArray);
        }

        return $paramArray;
    }
961
962
    /**
     * Builds URLs from a parameter array (the output of expandParameters()).
     * Every already accumulated URL is combined with every value of every parameter, so the number
     * of URLs is the product of the numbers of values per key.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated so far; each one is extended with the parameters
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        // Consume one parameter per round until none is left.
        while (count($paramArray) && is_array($urls)) {
            // Take the first parameter off the stack:
            reset($paramArray);
            $parameterName = key($paramArray);
            $parameterValues = array_shift($paramArray);

            // Combine every URL with every value of this parameter:
            $combinedUrls = [];
            foreach ($urls as $urlPrefix) {
                foreach ($parameterValues as $parameterValue) {
                    $querySuffix = '';
                    if (strcmp($parameterValue, '')) {
                        $querySuffix = '&' . rawurlencode($parameterName) . '=' . rawurlencode($parameterValue);
                    }
                    $combinedUrls[] = $urlPrefix . $querySuffix;

                    // Stops adding values for the current URL once the configured maximum is exceeded.
                    if (count($combinedUrls) > MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000)) {
                        break;
                    }
                }
            }

            $urls = $combinedUrls;
        }

        return $urls;
    }
995
996
    /************************************
997
     *
998
     * Crawler log
999
     *
1000
     ************************************/
1001
1002
    /**
     * Returns (or flushes) the crawler queue records of a page.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is not restricted to the page; the filter still applies
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled DESC"; an empty array after a flush
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $pageId = intval($id);

        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushScope = $doFullFlush ? '1=1' : ('page_id=' . $pageId);
            $this->flushQueue($flushScope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'page_id=' . $pageId . $addWhere,
            '',
            'scheduled DESC',
            ($limit > 0 ? $limit : '')
        );
    }
1041
1042
    /**
     * Returns (or flushes) the crawler queue records of a set.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled DESC"; an empty array after a flush
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $setId = intval($set_id);

        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // NOTE(review): a full flush drops the filter as well, unlike getLogEntriesForPageId() - confirm this is intended.
            $flushCondition = '';
            if (!$doFullFlush) {
                $flushCondition = 'set_id=' . $setId . $addWhere;
            }
            $this->flushQueue($flushCondition);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'set_id=' . $setId . $addWhere,
            '',
            'scheduled DESC',
            ($limit > 0 ? $limit : '')
        );
    }
1080
1081
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, that event is posted once per
     * affected set_id (together with the entries selected for that set) before the entries are deleted.
     *
     * @param string $where SQL condition selecting the entries to remove; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // Parenthesise the caller's condition and compare set_id as an integer: the value
                    // is not quoted as a string and an OR inside $where cannot change the meaning.
                    // NOTE(review): other queries on tx_crawler_queue in this class use "qid" - confirm a "uid" column exists.
                    $entries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    $eventDispatcher->post('queueEntryFlush', $group['set_id'], $entries);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1102
1103
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; a non-array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Without an explicit schedule time the entry is due immediately.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1132
1133
    /************************************
1134
     *
1135
     * URL setting
1136
     *
1137
     ************************************/
1138
1139
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or inserted into tx_crawler_queue, unless an equal entry already exists.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // The key order of $parameters is part of the serialized value and therefore of its hash.
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroups = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $feUserGroupList = implode(',', $feUserGroups);
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored in the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry (unless the caller opted out)
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicates) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1217
1218
    /**
     * Determines duplicates of a queue entry, i.e. not yet executed and not yet assigned entries
     * for the same page and the same parameters hash.
     *
     * If the timestamp is not in the future, any such entry scheduled up to now counts as a duplicate
     * (with "enableTimeslot" set, additionally entries scheduled within 100 seconds around now).
     * If the timestamp is in the future, only entries with exactly the same schedule time count.
     *
     * @param int $tstamp Schedule time of the entry to check
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are used
     *
     * @return array List of the qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];
        $where = '';

        $currentTime = $this->getCurrentTime();

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } elseif ($tstamp > $currentTime) {
            // An entry with a timestamp in the future needs to have the same schedule time.
            // The value is written into the SQL condition, so force it to an integer.
            $where = 'scheduled = ' . intval($tstamp);
        }

        if ($where === '') {
            return $rows;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1268
1269
    /**
     * Provides the current system time as returned by time().
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1278
1279
    /************************************
1280
     *
1281
     * URL reading
1282
     *
1283
     ************************************/
1284
1285
    /**
     * Reads the URL of a single queue entry and stores the result in the queue record.
     *
     * The record gets its exec_time set before the request is made; afterwards the serialized
     * result is written to result_data. Signals are emitted before and after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit set with self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction
     *                 reported success; 0 otherwise, also when no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : null;

        if (!is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of NULL.
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // The result is not always an array with a "content" key; only unserialize when it is.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // At the moment there is no need to point to specific pollable extensions
        if (is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1372
1373
    /**
     * Reads the URL for a log entry that is not yet in the queue table.
     *
     * The entry is inserted with exec_time set, processed, and then updated with the serialized result.
     *
     * @param array $field_array Queue field array
     *
     * @return array|string Result as delivered by readUrl_exec()
     */
    public function readUrlFromArray($field_array)
    {
        // Insert the record with exec_time already set, which locks it:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $queueId = $this->db->sql_insert_id();
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Storing the result in the log also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $resultFields);

        return $result;
    }
1403
1404
    /**
     * Reads the URL for a queue record.
     *
     * If the unserialized parameters contain "_CALLBACKOBJ", the referenced object's
     * crawler_execute() is called; otherwise the URL in "url" is requested via requestUrl()
     * and the "urlCrawled" event is posted.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool The string "ERROR" if the parameters cannot be unserialized to an array,
     *                           an array with key "content" for call backs, otherwise the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        if (!empty($parameters['_CALLBACKOBJ'])) { // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
            // Plain assignment: a function's return value cannot be assigned by reference.
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);
                $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
            } else {
                $result = ['content' => 'No object: ' . $objRef];
            }

            return $result;
        }

        // Regular FE request:

        // Prepare the crawler ID: the qid plus a hash over qid, set_id and the encryption key
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1439
1440
    /**
1441
     * Gets the content of a URL.
1442
     *
1443
     * @param string $originalUrl URL to read
1444
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
1445
     * @param integer $timeout Timeout time
1446
     * @param integer $recursion Recursion limiter for 302 redirects
1447
     * @return array
1448
     */
1449 2
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
1450
    {
1451 2
        if (!$recursion) {
1452
            return false;
1453
        }
1454
1455
        // Parse URL, checking for scheme:
1456 2
        $url = parse_url($originalUrl);
1457
1458 2
        if ($url === false) {
1459
            if (TYPO3_DLOG) {
1460
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1461
            }
1462
            return false;
1463
        }
1464
1465 2
        if (!in_array($url['scheme'], ['','http','https'])) {
1466
            if (TYPO3_DLOG) {
1467
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1468
            }
1469
            return false;
1470
        }
1471
1472
        // direct request
1473 2
        if ($this->extensionSettings['makeDirectRequests']) {
1474 2
            $result = $this->sendDirectRequest($originalUrl, $crawlerId);
1475 2
            return $result;
1476
        }
1477
1478
        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
1479
1480
        // thanks to Pierrick Caillon for adding proxy support
1481
        $rurl = $url;
1482
1483
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
1484
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
1485
            $url['path'] = $url['scheme'] . '://' . $url['host'] . ($url['port'] > 0 ? ':' . $url['port'] : '') . $url['path'];
1486
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
1487
        }
1488
1489
        $host = $rurl['host'];
1490
1491
        if ($url['scheme'] == 'https') {
1492
            $host = 'ssl://' . $host;
1493
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 443;
1494
        } else {
1495
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 80;
1496
        }
1497
1498
        $startTime = microtime(true);
1499
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);
1500
1501
        if (!$fp) {
1502
            if (TYPO3_DLOG) {
1503
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1504
            }
1505
            return false;
1506
        } else {
1507
            // Request message:
1508
            $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
1509
            fputs($fp, $msg);
1510
1511
            // Read response:
1512
            $d = $this->getHttpResponseFromStream($fp);
1513
            fclose($fp);
1514
1515
            $time = microtime(true) - $startTime;
1516
            $this->log($originalUrl . ' ' . $time);
1517
1518
            // Implode content and headers:
1519
            $result = [
1520
                'request' => $msg,
1521
                'headers' => implode('', $d['headers']),
1522
                'content' => implode('', (array)$d['content'])
1523
            ];
1524
1525
            if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'], $url['pass']))) {
1526
                $result = array_merge(['parentRequest' => $result], $this->requestUrl($newUrl, $crawlerId, $recursion--));
0 ignored issues
show
It seems like $newUrl defined by $this->getRequestUrlFrom...['user'], $url['pass']) on line 1525 can also be of type boolean; however, AOE\Crawler\Controller\C...ontroller::requestUrl() does only seem to accept string, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
1527
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);
0 ignored issues
show
It seems like $newUrl defined by $this->getRequestUrlFrom...['user'], $url['pass']) on line 1525 can also be of type boolean; however, AOE\Crawler\Controller\C...ontroller::requestUrl() does only seem to accept string, maybe add an additional type check?

If a method or function can return several different types of value, and you are not certain that only one of them can occur in this context, we recommend adding a type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
1528
1529
                if (is_array($newRequestUrl)) {
1530
                    $result = array_merge(['parentRequest' => $result], $newRequestUrl);
1531
                } else {
1532
                    if (TYPO3_DLOG) {
1533
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1534
                    }
1535
                    return false;
1536
                }
1537
            }
1538
1539
            return $result;
1540
        }
1541
    }
1542
1543
    /**
     * Resolves the base path of the website frontend.
     *
     * Example: when the site is reached via http://mydomain.com/cms/index.php
     * the base path is "/cms/".
     *
     * Lookup order:
     *  1. extension setting "frontendBasePath" (if set and non-empty),
     *  2. $GLOBALS['TSFE']->absRefPrefix (if set and non-empty),
     *  3. GeneralUtility::getIndpEnv('TYPO3_SITE_PATH') when not running in CLI mode,
     *  4. "/" otherwise.
     *
     * @return string Base path of the website frontend, always in the form "/" or "/<segments>/"
     */
    protected function getFrontendBasePath()
    {
        if (isset($this->extensionSettings['frontendBasePath']) && $this->extensionSettings['frontendBasePath']) {
            // Explicitly configured in the extension settings.
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (isset($GLOBALS['TSFE']->absRefPrefix) && !empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix of the current frontend.
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
            // Outside of CLI mode the path can be read from the request environment.
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Normalise to exactly one leading and one trailing slash.
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1573
1574
    /**
     * Runs a shell command via shell_exec() and hands back whatever it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Output of the command as returned by shell_exec()
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1585
1586
    /**
     * Reads an HTTP response from the given stream.
     *
     * The stream is read line by line (at most 2047 bytes per read). Everything
     * up to the first empty line is collected as trimmed header lines; everything
     * after it is collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server (trimmed, one per item).
     *                                  content <array> Content, with each read line as an array item.
     *                                  Both lists are empty when $streamPointer is not a resource.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. Compare against false explicitly: fgets() may return
        // a string such as "0" that is falsy but still valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line separates headers from the body.
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content until EOF.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1617
1618
    /**
     * Appends a timestamped line to the log file configured in the extension
     * setting "logFileName". Does nothing when no log file is configured.
     *
     * If the file cannot be written, a devLog entry is created instead.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Warnings are suppressed on purpose; failure is reported via devLog below.
        $written = @file_put_contents($logFile, $entry, FILE_APPEND);
        if (!$written) {
            GeneralUtility::devLog('File "' . $logFile . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1630
1631
    /**
     * Builds the HTTP/1.0 request lines (request line plus headers) for a crawl request.
     *
     * $url is expected in the shape produced by parse_url(). Because parse_url()
     * omits components that are not present, every key is treated as optional:
     * a missing or empty path is sent as "/", a missing query/user/pass is
     * treated as an empty string.
     *
     * @param array $url URL components (host, path, query, user, pass)
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     *
     * @return array List of request lines, without line terminators
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        $host = isset($url['host']) ? $url['host'] : '';
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        // Workspace preview requests get a backend user cookie attached.
        if (stristr($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1655
1656
    /**
     * Checks whether the given HTTP response headers describe a redirect and,
     * if so, builds the URL the crawler should request next.
     *
     * Only status lines containing "301 Moved", "302 Found" or "302 Moved" are
     * treated as redirects. The first "Location" header (name matched
     * case-insensitively) is used. When $user is given, the credentials are
     * inserted into the target URL; scheme, host, port, path and query of the
     * Location value are kept, a fragment is not.
     *
     * @param array $headers HTTP response header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if the headers are not a usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stristr($statusLine, '301 Moved') !== false
            || stristr($statusLine, '302 Found') !== false
            || stristr($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Lines without a ":" (e.g. the status
        // line) are skipped; the limit keeps colons inside the URL intact.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Re-insert the HTTP auth credentials into the redirect target.
        $urlParts = parse_url($location);
        if (!$urlParts) {
            return false;
        }

        $newUrl = (isset($urlParts['scheme']) ? $urlParts['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($urlParts['host']) ? $urlParts['host'] : '')
            . (isset($urlParts['port']) ? ':' . $urlParts['port'] : '')
            . (isset($urlParts['path']) ? $urlParts['path'] : '');
        if (isset($urlParts['query']) && $urlParts['query'] != '') {
            $newUrl .= '?' . $urlParts['query'];
        }

        return $newUrl;
    }
1698
1699
    /**************************
1700
     *
1701
     * tslib_fe hooks:
1702
     *
1703
     **************************/
1704
1705
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header ("<queueId>:<hash>"), looks up the
     * matching queue record and verifies that the request comes from this
     * system by comparing the hash with
     * md5(qid . '|' . set_id . '|' . encryptionKey).
     *
     * On success the crawler state is written to
     * $params['pObj']->applicationData['tx_crawler']; on failure the request
     * is terminated with die(). Requests without the header are left untouched.
     *
     * @param array $params Parameters from frontend; $params['pObj'] receives the crawler application data
     * @param object $ref TSFE object (reference under PHP5); not used by this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only requests carrying the crawler header are handled here.
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $headerParts = explode(':', $_SERVER['HTTP_X_T3CRAWLER']);
        $queueId = intval($headerParts[0]);
        $hash = isset($headerParts[1]) ? (string)$headerParts[1] : '';

        // exec_SELECTgetSingleRow() already returns the (associative) row
        // itself, so it must not be destructured with list().
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . $queueId);

        $isAuthentic = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // Timing-safe comparison of the request hash.
            $isAuthentic = hash_equals($expectedHash, $hash);
        }

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matched: set it up.
        // NOTE(review): "parameters" is unserialized from the queue table; this
        // assumes only the crawler itself writes that column.
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1733
1734
    /*****************************
1735
     *
1736
     * Compiling URLs to crawl - tools
1737
     *
1738
     *****************************/
1739
1740
    /**
     * Walks the page tree below a root page and compiles the crawler URLs for
     * every page in it, returning the HTML table rows that describe them.
     *
     * The submit/download flags and the other arguments are stored on the
     * controller so that drawURLs_addRowsForPage() can use them. Mount point
     * pages (doktype 7) additionally have their mounted subtree traversed.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string HTML rows, one or more per visited page
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $BACK_PATH;
        global $LANG;

        // Make sure a language service is available.
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }

        // Remember the request parameters for the row rendering.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Reset the per-run accumulators.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the page tree, restricted to pages the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry, if accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPage)
            ];
        }

        // Add the branch beneath the root page.
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $rows = '';
        foreach ($pageTree->tree as $node) {
            $this->MP = false;

            // Mount points: also traverse the mounted pages.
            if ($node['row']['doktype'] == 7) {
                $mountRows = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . $node['row']['uid']);

                $this->MP = $mountRows[0]['mount_pid'] . '-' . $node['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountRows[0]['mount_pid'], $depth, '');

                foreach ($mountedTree->tree as $mountedNode) {
                    $mountedTitle = $mountedNode['HTML'] . BackendUtility::getRecordTitle('pages', $mountedNode['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountedNode['row'], $mountedTitle);
                }

                if ($mountRows[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mount point is replaced by the mounted page.
                    $node['row']['uid'] = $mountRows[0]['mount_pid'];
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                }
            }

            $nodeTitle = $node['HTML'] . BackendUtility::getRecordTitle('pages', $node['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($node['row'], $nodeTitle);
        }

        return $rows;
    }
1838
1839
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>" or
     * "<pid>+<depth>". An entry without "+" covers only the page itself; an
     * entry with "+" but without a (non-empty) depth covers the page and its
     * subtree down to depth 99. Results, and the subtrees fetched to compute
     * them, are cached in static variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two items so entries without "+<depth>" do not
                    // read an undefined offset.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1890
1891
    /**
     * Renders the table rows for one page of the page tree.
     * For each crawler configuration that applies to the page one row is
     * produced, showing the GET variable configuration, the expanded
     * parameters, the resulting URLs and the configuration options. Pages
     * without any configuration get a single "No entries" row.
     *
     * @param    array        $pageRow Page row
     * @param    string        $pageTitleAndIcon Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // All configurations that apply to this page.
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Drop every configuration that is not part of the current selection.
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach (array_keys($configurations) as $candidateKey) {
                if (!in_array($candidateKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$candidateKey]);
                }
            }
        }

        $rowIndex = 0;
        $content = '';
        if (count($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // The title cell spans all rows of the page and is only emitted once.
                $titleClm = $rowIndex
                    ? ''
                    : '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                $rowClass = 'bgColor' . ($rowIndex % 2 ? '-20' : '-10');

                $isExcluded = in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']));
                if ($isExcluded) {
                    $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters, plus the number of combinations they yield.
                    $paramExpanded = '';
                    $valueCounts = [];
                    $combinationCount = 1;
                    foreach ($confArray['paramExpanded'] as $paramName => $paramValues) {
                        $valueCount = count($paramValues);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $paramName . '=') . '<br/>(' . $valueCount . ')</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $paramValues))) . '</td>
                            </tr>
                        ';
                        $combinationCount *= $valueCount;
                        $valueCounts[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>'
                        . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinationCount;

                    // Options
                    $subCfg = $confArray['subCfg'];
                    $optionValues = '';
                    if ($subCfg['userGroups']) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if ($subCfg['baseUrl']) {
                        $optionValues .= 'Base Url: ' . $subCfg['baseUrl'] . '<br/>';
                    }
                    if ($subCfg['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // One parameter per line for display.
                    $parsedParams = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                    $parsedParams = rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParams)));

                    // Compile row:
                    $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParams)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        } else {
            $message = empty($skipMessage) ? '' : ' (' . $skipMessage . ')';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
2008
2009
    /*****************************
2010
     *
2011
     * CLI functions
2012
     *
2013
     *****************************/
2014
2015
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h/--help is given. Otherwise, if
     * the crawler is not disabled and a new process can be acquired, the queue
     * is processed and the process is released again.
     *
     * @return int Bitmask of the self::CLI_STATUS_* constants describing the run
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        $helpRequested = isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help']);
        if ($helpRequested) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        // Nothing to do when crawling is disabled or no process slot is free.
        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        // Command line arguments take precedence over the extension settings.
        $countArgument = $cliObj->cli_argValue('--countInARun');
        $countInARun = $countArgument ? intval($countArgument) : $this->extensionSettings['countInARun'];

        // Seconds
        $sleepAfterFinishArgument = $cliObj->cli_argValue('--sleepAfterFinish');
        $sleepAfterFinish = $sleepAfterFinishArgument
            ? intval($sleepAfterFinishArgument)
            : $this->extensionSettings['sleepAfterFinish'];

        // Milliseconds
        $sleepTimeArgument = $cliObj->cli_argValue('--sleepTime');
        $sleepTime = $sleepTimeArgument ? intval($sleepTimeArgument) : $this->extensionSettings['sleepTime'];

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $exception) {
            $this->CLI_debug(get_class($exception) . ': ' . $exception->getMessage());
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

        //TODO can't we do that in a clean way?
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        $this->CLI_debug("Unprocessed Items remaining:" . $this->queueRepository->countUnprocessedItems() . " (" . $this->CLI_buildProcessId() . ")");

        if ($this->queueRepository->countUnprocessedItems() > 0) {
            $result |= self::CLI_STATUS_REMAIN;
        } else {
            $result |= self::CLI_STATUS_NOTHING_PROCCESSED;
        }

        return $result;
    }
2062
2063
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Compiles the crawler URLs for a page (sub)tree and, depending on the
     * "-o" option, prints them ("url"), puts them into the queue ("queue"),
     * executes them right away ("exec") or just lists what was found.
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id argument only the help text is printed.
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        $outputMode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // Called over TYPO3 BE the page id is the third argument, over cli the second.
        $pageIdIndex = isset($cliObj->cli_args['_DEFAULT'][2]) ? 2 : 1;
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][$pageIdIndex], 0);

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            $logIndent = chr(10) . chr(9) . chr(9);
            foreach ($this->queueEntries as $queueRec) {
                $parameters = unserialize($queueRec['parameters']);
                $cliObj->cli_echo($parameters['url'] . ' (' . implode(',', $parameters['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (!is_array($requestResult)) {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                    continue;
                }

                $resLog = '';
                if (is_array($requestResult['log'])) {
                    $resLog = $logIndent . implode($logIndent, $requestResult['log']);
                }
                $cliObj->cli_echo('OK: ' . $resLog . chr(10));
            }
        } elseif ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2166
2167
    /**
     * Function executed by the crawler flush cli script.
     *
     * Flushes queue entries for the page id given as first argument; page id 0
     * means a full flush. The "-o" option selects which entries are affected:
     * "all", "finished" or "pending". Any other value prints the help text.
     *
     * @return bool False if the mode was not recognised or the flush reported false, true otherwise
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id argument only the help text is printed.
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        if ($mode == 'all') {
            // "all" is passed on as an empty filter.
            $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
        } elseif ($mode == 'finished' || $mode == 'pending') {
            $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
        } else {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            $result = false;
        }

        return $result !== false;
    }
2210
2211
    /**
     * Reads the configuration keys from the "-conf" CLI argument.
     *
     * The argument is a comma separated list; surrounding whitespace of the
     * argument and of each key is removed.
     *
     * @param  QueueCommandLineController $cliObj    Command line object
     * @return array                        List of configuration keys, empty if the argument is missing or blank
     */
    protected function getConfigurationKeys(QueueCommandLineController &$cliObj)
    {
        $parameter = trim($cliObj->cli_argValue('-conf'));
        if ($parameter == '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }
2222
2223
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges executed queue entries older than "purgeQueueDays"
     * (if that setting is greater than 0), reserves up to $countInARun due queue entries
     * for the current process and then reads them one by one.
     *
     * @param int $countInARun       Maximum number of queue entries selected for this run
     * @param int $sleepTime         Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish  Pause after the batch, passed to sleep()
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (!$del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // exec_SELECTgetRows() yields NULL on an SQL error; handle that like an empty queue
        // instead of passing NULL to count()
        if (is_array($rows) && count($rows) > 0) {
            // qids are numeric, the cast keeps the IN() list below free of anything else
            $quidList = array_map('intval', array_column($rows, 'qid'));

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entries
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN (' . implode(',', $quidList) . ')',
                [
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId
                ]
            );

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '" . $processId . "'",
                [
                    'assigned_items_count' => intval($numberOfAffectedRows)
                ]
            );

            // Only continue if every selected entry could be reserved for this process
            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disabled we need to return
                // from the function and mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2338
2339
    /**
     * Activate hooks
     *
     * Creates an object for every entry registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method, passing this controller.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // isset() first: reading a missing nested key directly would raise an "undefined index" notice
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: assigning a function result by reference raises a notice
            // unless the function itself returns by reference
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2356
2357
    /**
     * Tries to register a new crawler process under the given id.
     *
     * Counts the active, non-deleted process records whose ttl has not passed yet and
     * inserts a new record if that number is below the "processLimit" setting.
     * Afterwards the records with an expired ttl are released and the records already
     * marked as deleted are removed; everything happens between one BEGIN and COMMIT.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean FALSE if no system process id is available or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $this->db->sql_query('BEGIN');

        $processResult = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        // Separate the processes that are still running from the expired ("orphan") ones
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($processRow = $this->db->sql_fetch_assoc($processResult)) {
            if ($processRow['ttl'] < $currentTime) {
                $expiredProcessIds[] = $processRow['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            // there are less than the allowed number of active processes, so add a new one
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => '1',
                    'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2421
2422
    /**
     * Release a process and the required resources
     *
     * Besides deactivating the given processes and un-assigning their unprocessed queue entries,
     * this also cleans up after processes that were deactivated earlier (see inline comments).
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   TRUE if the caller already opened a transaction; no BEGIN/COMMIT is sent then
     * @return boolean  FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quote every id once so both IN() lists below are built the same, safe way
        $quotedReleaseIds = implode(',', $this->db->fullQuoteArray($releaseIds, 'tx_crawler_process'));

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // un-assign all queue entries that still belong to processes which are not active anymore
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedReleaseIds . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // un-assign the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedReleaseIds . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2491
2492
    /**
     * Removes all process records whose "deleted" flag is set
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2501
2502
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * @param  string  $pid identification string for the process
     * @return boolean TRUE if a non-deleted process record with this id exists and is flagged active
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        // No explicit transaction: a single SELECT does not need one, and a BEGIN/COMMIT
        // issued here would also end a transaction opened by the caller.
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            // the id is quoted instead of being concatenated into the statement as-is
            'process_id = ' . $this->db->fullQuoteStr($pid, 'tx_crawler_process') . ' AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        $row = $this->db->sql_fetch_assoc($res);
        if (!$row) {
            return false;
        }

        return intval($row['active']) == 1;
    }
2530
2531
    /**
     * Returns the id of the current process
     *
     * The id is created on the first call (short MD5 of the current microtime) and
     * kept in $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2543
2544
    /**
     * Wrapper around PHP's microtime()
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed The value microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2553
2554
    /**
     * Echoes a message followed by a line break, but only if the "processDebug"
     * extension setting evaluates to a non-zero integer
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2566
2567
    /**
     * Fetches the content of a URL by running the crawler's cli/bootstrap.php through
     * the configured PHP binary instead of sending an HTTP request.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Keys "request" (request headers as string), "headers" (always an empty string)
     *                and "content" (output of the shell command); empty array if the URL cannot be parsed
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // PHP binary, bootstrap script, frontend base path, URL and the encoded request headers
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers)))
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        // Log the URL together with the time the command took
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2605
2606
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries whose scheduled time is older than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2622
2623
    /**
     * Sets up $GLOBALS['TSFE'] (and $GLOBALS['TT'] if missing) so that TypoScript
     * and TypoLink functions can be used
     *
     * @param int $id       Passed to the TypoScriptFrontendController and used for the rootline lookup
     * @param int $typeNum  Passed to the TypoScriptFrontendController
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        // Provide a time tracker if none has been set up yet
        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new NullTimeTracker();
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $frontendController = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        // Register globally first; the calls below stay in their original order
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontendController->sys_page->init(true);
        $frontendController->connectToDB();
        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->initTemplate();
        $frontendController->rootLine = $frontendController->sys_page->getRootLine($id, '');
        $frontendController->getConfigArray();

        PageGenerator::pagegenInit();
    }
2650
2651
    /**
     * Builds the md5 hash of the serialized configuration array, ignoring
     * its "paramExpanded" and "URLs" entries.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // These two entries must not influence the hash
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2663
2664
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration is compared loosely, in this order:
     * -1 returns FALSE, 0 returns $pageConfiguration unchanged, 1 returns TRUE,
     * any other value returns FALSE.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            // Defer to the setting of the page
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2684
}
2685