Completed
Push — master ( 02b531...a2dda1 )
by Tomas Norre
05:54
created

Classes/Controller/CrawlerController.php (2 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\IconUtility;
35
use AOE\Crawler\Utility\SignalSlotUtility;
36
use TYPO3\CMS\Backend\Utility\BackendUtility;
37
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Database\DatabaseConnection;
40
use TYPO3\CMS\Core\Log\LogLevel;
41
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
42
use TYPO3\CMS\Core\Utility\DebugUtility;
43
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
44
use TYPO3\CMS\Core\Utility\GeneralUtility;
45
use TYPO3\CMS\Core\Utility\MathUtility;
46
use TYPO3\CMS\Extbase\Object\ObjectManager;
47
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
48
use TYPO3\CMS\Frontend\Page\PageGenerator;
49
use TYPO3\CMS\Frontend\Page\PageRepository;
50
use TYPO3\CMS\Frontend\Utility\EidUtility;
51
use TYPO3\CMS\Lang\LanguageService;
52
53
/**
54
 * Class CrawlerController
55
 *
56
 * @package AOE\Crawler\Controller
57
 */
58
class CrawlerController
59
{
60
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
61
    const CLI_STATUS_REMAIN = 1; //queue not empty
62
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
63
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
64
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
65
66
    /**
67
     * @var integer
68
     */
69
    public $setID = 0;
70
71
    /**
72
     * @var string
73
     */
74
    public $processID = '';
75
76
    /**
77
     * One hour is max stalled time for the CLI
78
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
79
     *
80
     * @var integer
81
     */
82
    public $max_CLI_exec_time = 3600;
83
84
    /**
85
     * @var array
86
     */
87
    public $duplicateTrack = [];
88
89
    /**
90
     * @var array
91
     */
92
    public $downloadUrls = [];
93
94
    /**
95
     * @var array
96
     */
97
    public $incomingProcInstructions = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $incomingConfigurationSelection = [];
103
104
    /**
105
     * @var bool
106
     */
107
    public $registerQueueEntriesInternallyOnly = false;
108
109
    /**
110
     * @var array
111
     */
112
    public $queueEntries = [];
113
114
    /**
115
     * @var array
116
     */
117
    public $urlList = [];
118
119
    /**
120
     * @var boolean
121
     */
122
    public $debugMode = false;
123
124
    /**
125
     * @var array
126
     */
127
    public $extensionSettings = [];
128
129
    /**
130
     * Mount Point
131
     *
132
     * @var boolean
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var DatabaseConnection
150
     */
151
    private $db;
152
153
    /**
154
     * @var BackendUserAuthentication
155
     */
156
    private $backendUser;
157
158
    /**
159
     * @var integer
160
     */
161
    private $scheduledTime = 0;
162
163
    /**
164
     * @var integer
165
     */
166
    private $reqMinute = 0;
167
168
    /**
169
     * @var bool
170
     */
171
    private $submitCrawlUrls = false;
172
173
    /**
174
     * @var bool
175
     */
176
    private $downloadCrawlUrls = false;
177
178
    /**
179
     * @var QueueRepository
180
     */
181
    protected  $queueRepository;
182
183
    /**
184
     * Returns the current access mode, which can be gui, cli or cli_im
185
     *
186
     * @return string
187
     */
188 1
    public function getAccessMode()
    {
        // Hand back whatever mode was stored via setAccessMode() ('gui', 'cli' or 'cli_im').
        $currentMode = $this->accessMode;

        return $currentMode;
    }
192
193
    /**
194
     * @param string $accessMode
195
     */
196 1
    public function setAccessMode($accessMode)
    {
        // Stores the access mode as given; the value is not validated here.
        $this->accessMode = $accessMode;
    }
200
201
    /**
202
     * Set disabled status to prevent processes from being processed
203
     *
204
     * @param  bool $disabled (optional, defaults to true)
205
     * @return void
206
     */
207 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        if (!$disabled) {
            // Enabling: remove the marker file, but only if it is actually there.
            if (is_file($markerFile)) {
                unlink($markerFile);
            }

            return;
        }

        // Disabling: the mere existence of the (empty) marker file is the flag,
        // see getDisabled().
        GeneralUtility::writeFile($markerFile, '');
    }
217
218
    /**
219
     * Get disable status
220
     *
221
     * @return bool true if disabled
222
     */
223 3
    public function getDisabled()
    {
        // The crawler counts as disabled exactly when the marker file exists.
        return is_file($this->processFilename);
    }
231
232
    /**
233
     * @param string $filenameWithPath
234
     *
235
     * @return void
236
     */
237 4
    public function setProcessFilename($filenameWithPath)
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $this->processFilename = $filenameWithPath;
    }
241
242
    /**
243
     * @return string
244
     */
245 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
249
250
    /************************************
251
     *
252
     * Getting URLs based on Page TSconfig
253
     *
254
     ************************************/
255
256 28
    public function __construct()
    {
        // The queue repository is resolved through the Extbase object manager.
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);

        // Collaborators taken from the TYPO3 globals at construction time.
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // Read ext_em_conf_template settings; anything that does not
        // unserialize to an array is treated as "no settings".
        $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($extConf)) {
            $extConf = [];
        }
        $this->setExtensionSettings($extConf);

        // Defaults: fall back to 100 items per run when nothing usable is configured ...
        $configuredCount = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($configuredCount == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // ... and keep the process limit inside 1..99 (default 1).
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
278
279
    /**
280
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
281
     *
282
     * @param array $extensionSettings
283
     * @return void
284
     */
285 37
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the complete settings array; no merging with previous values.
        $this->extensionSettings = $extensionSettings;
    }
289
290
    /**
291
     * Check if the given page should be crawled
292
     *
293
     * @param array $pageRow
294
     * @return false|string false if the page should be crawled (not excluded), otherwise a message explaining why it is skipped
295
     */
296 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // 1) Hidden pages are skipped unless crawling them is enabled in the extension settings.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // 2) Doktypes 3 and 4 and everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // 3) Doktype lists registered in EXTCONF; the array key names who excluded it.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $excludedBy => $excludedDoktypes) {
                if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $excludedBy . '"';
                }
            }
        }

        // 4) Veto hooks: a hook returns false if the page is fine, true or a
        //    skip message otherwise. The first veto wins; later hooks are not called.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookKey => $vetoFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                $veto = GeneralUtility::callUserFunction($vetoFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                return is_string($veto)
                    ? $veto
                    : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
            }
        }

        // Nothing objected: the page should be crawled.
        return false;
    }
353
354
    /**
355
     * Wrapper method for getUrlsForPageId()
356
     * It returns an array of configurations and no urls!
357
     *
358
     * @param array $pageRow Page record with at least dok-type and uid columns.
359
     * @param string $skipMessage
360
     * @return array
361
     * @see getUrlsForPageId()
362
     */
363 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // false means "crawl this page"; anything else is the reason for skipping it.
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // url_scheme 2 requests https. Rows read from the database usually
        // carry the value as a string, so it is normalised to int before the
        // comparison; a missing column simply means "do not force SSL".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;
        $skipMessage = '';

        return $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
    }
378
379
    /**
380
     * This method is used to count if there are ANY unprocessed queue entries
381
     * of a given page_id and the configuration which matches a given hash.
382
     * If there are none, we can skip an inner detail check
383
     *
384
     * @param  int $uid
385
     * @param  string $configurationHash
386
     * @return boolean
387
     */
388 7
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // Count queue rows for this page and configuration hash that have not
        // been executed yet (exec_time = 0).
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $res = $this->db->exec_SELECTquery(
            'count(*) as anz',
            'tx_crawler_queue',
            'page_id=' . intval($uid) . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0'
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set, as getConfigurationsForBranch() does.
        $this->db->sql_free_result($res);

        // If the count could not be fetched we cannot claim that no entries
        // exist, so report false and let the caller run its detailed check.
        if (!is_array($row)) {
            return false;
        }

        return (int)$row['anz'] === 0;
    }
396
397
    /**
398
     * Creates a list of URLs from input array (and submits them to queue if asked for)
399
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
400
     *
401
     * @param    array        Information about URLs from pageRow to crawl.
402
     * @param    array        Page row
403
     * @param    integer        Unix time to schedule indexing to, typically time()
404
     * @param    integer        Number of requests per minute (creates the interleave between requests)
405
     * @param    boolean        If set, submits the URLs to queue
406
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
407
     * @param    array        Array which is passed by reference and contains an id per URL to ensure we do not crawl duplicates
408
     * @param    array        Array which will be filled with URLS for download if flag is set.
409
     * @param    array        Array of processing instructions
410
     * @return    string        List of URLs (meant for display in backend module)
411
     *
412
     */
413 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner).
        // The check does not change per URL, so it is evaluated once.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            // Fill in the pieces of the frontend controller that are still missing.
            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        // Seconds between two scheduled requests. A zero or non-numeric
        // request rate used to trigger a division by zero; it now means
        // "no interleave".
        $secondsPerRequest = (is_numeric($reqMinute) && $reqMinute != 0) ? 60 / $reqMinute : 0;

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, spread by the number of URLs tracked so far and
            // rounded down to a full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is overwritten (not appended) on every
            // iteration, so only the last URL's line is returned - confirm
            // whether callers rely on this before changing it.
            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
528
529
    /**
530
     * Returns true if input processing instruction is among registered ones.
531
     *
532
     * @param string $piString PI to test
533
     * @param array $incomingProcInstructions Processing instructions
534
     * @return boolean
535
     */
536 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No incoming instructions means nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // Accept as soon as one incoming instruction is part of the
        // comma-separated list in $piString.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Previously the method fell off the end here and returned null;
        // return the documented boolean instead.
        return false;
    }
548
549 4
    public function getPageTSconfigForId($id)
    {
        // Without a mount point the TSconfig of the page itself is used.
        // Otherwise it is read for the second dash-separated segment of $this->MP.
        $tsConfigPageId = $id;
        if ($this->MP) {
            $mountPointSegments = explode('-', $this->MP);
            $tsConfigPageId = $mountPointSegments[1];
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration. The TSconfig is handed over by
        // reference so that every registered function can modify it in place.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            $hookParameters = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $hookFunction) {
                GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            }
        }

        return $pageTSconfig;
    }
571
572
    /**
573
     * This method returns an array of configurations.
574
     * And no urls!
575
     *
576
     * @param integer $id Page ID
577
     * @param bool $forceSsl Use https
578
     * @return array
579
     *
580
     * TODO: Should be switched back to protected - TNM 2018-11-16
581
     */
582 4
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Part 1: configuration from page TSconfig (tx_crawler.crawlerCfg.paramSets)
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
            $paramSets = is_array($crawlerCfg['paramSets.']) ? $crawlerCfg['paramSets.'] : [];

            foreach ($paramSets as $setName => $setValues) {
                // Only the array entries ("name.") carry a sub configuration.
                if (!is_array($setValues)) {
                    continue;
                }

                $key = str_replace('.', '', $setName);

                // Sub configuration for a single configuration string:
                $subCfg = (array)$paramSets[$key . '.'];
                $subCfg['key'] = $key;

                // Normalise the filter list (trim every item) if one is given.
                if (strcmp($subCfg['procInstrFilter'], '')) {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                // Skip configurations that are page-specific and do not list the current page.
                if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams($paramSets[$key]);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $urlPrefix = '?id=' . $id;
                if ($this->MP) {
                    $urlPrefix .= '&MP=' . $this->MP;
                }

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$urlPrefix]),
                ];
            }
        }

        /**
         * Part 2: configuration from tx_crawler_configuration records along the rootline
         */
        foreach (BackendUtility::BEgetRootLine($id) as $page) {
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
                'tx_crawler_configuration',
                'pid',
                intval($page['uid']),
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
            );

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record: restricted records
                // are only usable by admins or members of one of the listed groups
                if (!empty($configurationRecord['begroups'])
                    && !$GLOBALS['BE_USER']->isAdmin()
                    && !$this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])
                ) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // Skip configurations that are page-specific and do not list the current page.
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'realurl' => $configurationRecord['realurl'],
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Pages listed in the record's exclude string get no entry.
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        /**
         * Part 3: let registered hooks post-process the result (passed by reference)
         */
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $hookFunction) {
                $hookParameters = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            }
        }

        return $res;
    }
718
719
    /**
720
     * Checks if a domain record exists and returns the base URL built from that record. Otherwise the given baseUrl string is returned.
721
     *
722
     * @param string $baseUrl
723
     * @param integer $sysDomainUid
724
     * @param bool $ssl
725
     * @return string
726
     */
727 4
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);

        // Without a domain record reference the configured base URL is used as is.
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Only a strict boolean false selects plain http.
        $urlScheme = ($ssl === false) ? 'http' : 'https';

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set, as getConfigurationsForBranch() does.
        $this->db->sql_free_result($res);

        // The fetch yields no array when the record is missing, hidden or
        // deleted; guard before reading the column instead of indexing it.
        if (is_array($row) && (string)$row['domainName'] !== '') {
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
747
748
    public function getConfigurationsForBranch($rootid, $depth)
    {
        // Collects the names of all crawler configurations that apply to a
        // page branch: first the paramSets defined in the page TSconfig of
        // $rootid, then the names of the tx_crawler_configuration records
        // stored on the rootline of $rootid or on pages of the subtree
        // (down to $depth) that the backend user may see.
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                // TSconfig array keys end with a dot; the configuration name does not.
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page ids of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        // ... plus the page ids of the subtree, restricted by the user's page permissions.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // The ids go straight into the SQL statement: force them to integers,
        // and do not query at all with an empty list ("pid IN ()" is invalid SQL).
        $pids = array_unique(array_map('intval', $pids));
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
793
794
    /**
     * Tells whether a user, identified by a list of group UIDs, may access an item.
     * (The group list of the currently logged in user is e.g. available in $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs of the user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs that may access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);

        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
815
816
    /**
     * Parses the GET variables of a query string into an array of key => value pairs.
     *
     * The string is split on "&", each pair on the first "=". Names and values are
     * raw-URL-decoded. Pairs with an empty name are dropped; a name without "="
     * yields an empty string as value. For repeated names the last one wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad to two elements so a pair without "=" does not trigger an
            // undefined offset and hand NULL to rawurldecode().
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');

            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
837
838
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - If the (trimmed) value is wrapped in [...] it is expanded as described below,
     *   otherwise it is taken literally and returned as a single-element array.
     * - The content of the brackets is split by "|"; each part is processed on its own
     *   and the results are collected.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (bounds
     *           included, from low to high, at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is $pid]];[_ENABLELANG:1]" =
     *           Look up of table records, filtering out deleted records.
     *           Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 restricts the lookup to records whose transOrigPointerField is <= 0.
     *           Further sub parts read by this method: _PIDFIELD (field compared with the
     *           page id, default "pid"), _FIELD (field to collect, default "uid"; must be
     *           "uid" or a TCA column of the table), _WHERE (appended to the WHERE clause)
     *           and _ADDTABLE (appended to the table name of the query).
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     *   are called; they receive $paramArray by reference.
     * - Finally the values of the parameter are made unique and $paramArray is sorted by key.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each holding an array of values
     */
    public function expandParameters($paramArray, $pid)
    {
        $hookRegistry = 'crawler/class.tx_crawler_lib.php';

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not wrapped in square brackets: the literal value is the only value.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and start with an empty value list.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    $rangeStart = $reg[1];
                    $rangeEnd = $reg[2];

                    // Swap if first is larger than last:
                    if ($rangeStart > $rangeEnd) {
                        $rangeStart = $reg[2];
                        $rangeEnd = $reg[1];
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $rangeStart; $a <= $rangeEnd; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the "KEY:value" sub parts. A sub part without ":" gets NULL as value.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only "uid" or a field configured in TCA may be selected.
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // Rows are indexed by $fieldName, so the array keys are the values to collect.
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookRegistry]['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookRegistry]['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookRegistry]['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
963
964
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then
     * the method recurses with the remaining parameters. The number of URLs is
     * therefore the product of the number of values per key, capped per recursion
     * level at the extension setting "maxCompileUrls" (forced into the range
     * 1..1000000000, default 10000). Empty values add nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        // Nothing (left) to combine: hand back what has been accumulated so far.
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while looping, so resolve it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; stopping only the inner
                // loop would let every remaining base URL add one more entry.
                if (count($newUrls) >= $maxUrls) {
                    break 2;
                }
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
997
998
    /************************************
999
     *
1000
     * Crawler log
1001
     *
1002
     ************************************/
1003
1004
    /**
     * Returns the crawler queue records of a page, or flushes them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not run yet (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE the restriction to the page ID is dropped (the filter still applies)
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue rows ordered by "scheduled" descending; an empty array after a flush
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageId = intval($id);

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $scope = $doFullFlush ? '1=1' : 'page_id=' . $pageId;
            $this->flushQueue($scope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'page_id=' . $pageId . $addWhere,
            '',
            'scheduled DESC',
            $limit > 0 ? $limit : ''
        );
    }
1043
1044
    /**
     * Returns the crawler queue records of a set, or flushes them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not run yet (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE an empty condition is handed to flushQueue(), i.e. neither the set ID nor the filter is applied
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue rows ordered by "scheduled" descending; an empty array after a flush
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setCondition = 'set_id=' . intval($set_id) . $addWhere;

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition = $doFullFlush ? '' : $setCondition;
            $this->flushQueue($flushCondition);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setCondition,
            '',
            'scheduled DESC',
            $limit > 0 ? $limit : ''
        );
    }
1082
1083
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is
     * posted once per affected set_id (together with the rows about to be removed)
     * before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Fixed: the object operator was written as ">" here, which turned the
            // query into a comparison against an undefined function.
            $groups = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // NOTE(review): other queries on tx_crawler_queue in this class select "qid";
                    // confirm that the table really has a "uid" column.
                    $affectedRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $affectedRows);
                }
            }
        }

        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1104
1105
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time is given.
        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1134
1135
    /************************************
1136
     *
1137
     * URL setting
1138
     *
1139
     ************************************/
1140
1141
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only
     * collected in $this->queueEntries or written to tx_crawler_queue. Before writing,
     * existing equal entries are looked up (unless skipped); if one exists, the
     * "duplicateUrlInQueue" event is posted instead of inserting, otherwise the entry
     * is inserted and "urlAddedToQueue" is posted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroups = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroups) {
            $parameters['feUserGroupList'] = $feUserGroups;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        // The entry is only registered internally and not stored to the database.
        if ($this->registerQueueEntriesInternallyOnly) {
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry.
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicates) > 0) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicates, 'fieldArray' => $fieldArray]
            );

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1219
1220
    /**
     * Determines duplicates of a queue entry with the same page, the same parameters hash and this timestamp.
     *
     * Only entries without exec_time and without process_id are considered.
     * If the timestamp is not in the future, any such entry scheduled up to now counts
     * as a duplicate (with the extension setting "enableTimeslot" the window additionally
     * covers +/- 100 seconds around the current time).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are read
     *
     * @return array List of qids of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        // The value ends up in the SQL condition, so make sure it is an integer.
        $tstamp = intval($tstamp);
        $currentTime = $this->getCurrentTime();

        if ($tstamp <= $currentTime) {
            // This entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1270
1271
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1280
1281
    /************************************
1282
     *
1283
     * URL reading
1284
     *
1285
     ************************************/
1286
1287
    /**
     * Reads the URL of a single queue entry and stores the result in the queue record.
     *
     * Steps: fetch the queue record, initialise TSFE if the entry's parameters carry
     * a "rootTemplatePid" (otherwise a warning is written to the syslog), emit the
     * pre-process signal, set exec_time to lock the record, execute the entry via
     * readUrl_exec(), evaluate the registered "pollSuccess" instructions, emit the
     * post-process signal and write the serialized result to "result_data".
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask: 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a pollable
     *                      instruction reported success; NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : null;

        if (!is_array($queueRec)) {
            return;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, $queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back a non-array ('ERROR' or FALSE); only an array
        // result carries a "content" entry that can be unserialized.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;
        $executedInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : null;

        // At the moment there is no need to point to specific pollable extensions
        if (is_array($executedInstructions)
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $executedInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, $field_array]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1374
1375
    /**
     * Reads the URL for a log entry that has not been inserted yet.
     *
     * The entry is inserted into tx_crawler_queue with exec_time set, executed via
     * readUrl_exec(), and the serialized result is written back to "result_data"
     * after the post-process signal has been emitted.
     *
     * @param array $field_array Queue field array,
     *
     * @return string Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $queueId = $this->db->sql_insert_id();
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, $resultFields]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $resultFields);

        return $result;
    }
1405
1406
    /**
     * Executes a queue record: either calls its call back object or requests its URL.
     *
     * - Parameters that do not unserialize to an array yield the string 'ERROR'.
     * - With a "_CALLBACKOBJ" parameter, the referenced object's crawler_execute() is
     *   called and its serialized return value is returned in the "content" key
     *   (or a "No object: ..." message if the reference does not resolve to an object).
     * - Otherwise the "url" parameter is requested via requestUrl() and the
     *   "urlCrawled" event is posted with the result.
     *
     * @param array $queueRec Queue record; "parameters", "qid" and "set_id" are read
     * @return array|string|bool Result array, 'ERROR', or whatever requestUrl() returned
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        // NOTE(review): unserialize() is applied to data read from the queue table;
        // this must never be fed with untrusted input.
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            // Plain assignment: a function result cannot be assigned by reference.
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:

        // Prepare:
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1441
1442
    /**
     * Gets the content of a URL.
     *
     * If the "makeDirectRequests" extension setting is enabled the request is
     * delegated to sendDirectRequest(). Otherwise a plain HTTP/1.0 request is
     * sent through fsockopen(), via the proxy from
     * $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'] when "curlUse" is
     * enabled. With the "follow30x" setting, a redirect reported by
     * getRequestUrlFrom302Header() is followed and the redirecting response
     * is kept under the "parentRequest" key.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout passed to fsockopen()
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|bool For socket requests an array with the keys "request",
     *                    "headers" and "content" (plus "parentRequest" after a
     *                    redirect); for direct requests whatever
     *                    sendDirectRequest() returns; false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // parse_url() leaves out components that are not part of the URL
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            // The connection goes to the proxy, so the request line carries the absolute URL
            $url['path'] = $scheme . '://'
                . (isset($url['host']) ? $url['host'] : '')
                . (!empty($url['port']) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        $host = isset($rurl['host']) ? $rurl['host'] : '';
        $hasExplicitPort = !empty($rurl['port']);

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = $hasExplicitPort ? $rurl['port'] : 443;
        } else {
            $port = $hasExplicitPort ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(
                    sprintf('Error while opening "%s"', $originalUrl),
                    'crawler',
                    4,
                    ['crawlerId' => $crawlerId, 'errno' => $errno, 'errstr' => $errstr]
                );
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $response = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $response['headers']),
            'content' => implode('', (array)$response['content'])
        ];

        if (!$this->extensionSettings['follow30x']) {
            return $result;
        }

        $newUrl = $this->getRequestUrlFrom302Header(
            $response['headers'],
            isset($url['user']) ? $url['user'] : '',
            isset($url['pass']) ? $url['pass'] : ''
        );

        if (!$newUrl) {
            return $result;
        }

        // Follow the redirect exactly once per hop, keeping the timeout and
        // counting the recursion limiter down so redirect loops terminate.
        $redirectResult = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

        if (!is_array($redirectResult)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        return array_merge(['parentRequest' => $result], $redirectResult);
    }
1544
1545
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension
     * setting, $GLOBALS['TSFE']->absRefPrefix, and - unless running in CLI
     * mode - the TYPO3_SITE_PATH environment value. Falls back to "/".
     *
     * @return string Base path of the website frontend, "/" or "/<pathSegments>/"
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
            // Outside CLI mode the path can be taken from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        // Base path must be '/' or '/<pathSegments>/'
        $segments = trim($basePath, '/');

        return $segments === '' ? '/' : '/' . $segments . '/';
    }
1575
1576
    /**
     * Runs a shell command via shell_exec() and hands back what it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1587
1588
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines are read in chunks of at most 2047 bytes. Header lines are
     * trimmed and collected until the first empty line; everything after
     * that is collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     *                                  Both are empty if $streamPointer is not a resource.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a chunk such as "0" is falsy but valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // the empty line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1619
1620
    /**
     * Appends a timestamped line to the file named by the "logFileName"
     * extension setting. Does nothing when that setting is empty; a failed
     * write is reported through devLog.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

        if (!@file_put_contents($logFile, $entry, FILE_APPEND)) {
            GeneralUtility::devLog('File "' . $logFile . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1632
1633
    /**
     * Builds HTTP request headers.
     *
     * @param array $url URL components as returned by parse_url(); the keys
     *                   "path", "query", "host", "user" and "pass" are read
     *                   and may be absent
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     *
     * @return array Request line followed by the header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits missing components, so fall back explicitly.
        // An empty path would produce the invalid request line "GET  HTTP/1.0".
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stristr($query, 'ADMCMD_previewWS')) {
            // workspace preview requests get a backend user cookie
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1657
1658
    /**
     * Checks whether the submitted HTTP headers describe a redirect and builds the new crawler URL.
     *
     * Only status lines containing "301 Moved", "302 Found" or "302 Moved"
     * are treated as redirects. The "Location" header name is matched
     * case-insensitively. If $user is given, the credentials are injected
     * into the redirect target (scheme, host, port, path and query are kept).
     *
     * @param array $headers HTTP header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stristr($statusLine, '301 Moved')
            || stristr($statusLine, '302 Found')
            || stristr($statusLine, '302 Moved');
        if (!$isRedirect) {
            return false;
        }

        $location = '';
        foreach ($headers as $headerLine) {
            // Split on the first colon only, so values containing ":" stay intact
            // and lines without a colon (the status line) are skipped.
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === '') {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }
        if (!isset($target['scheme'], $target['host'])) {
            // Relative redirect target: there is no authority part to put credentials into
            return $location;
        }

        $newUrl = $target['scheme'] . '://' . $user . ':' . $pass . '@' . $target['host'];
        if (isset($target['port'])) {
            $newUrl .= ':' . $target['port'];
        }
        if (isset($target['path'])) {
            $newUrl .= $target['path'];
        }
        if (isset($target['query']) && $target['query'] !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1700
1701
    /**************************
1702
     *
1703
     * tslib_fe hooks:
1704
     *
1705
     **************************/
1706
1707
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header ("<qid>:<hash>"), looks up the queue
     * record and verifies that the request comes from the system by comparing
     * the hash with md5(qid|set_id|encryptionKey). On success the crawler
     * state is stored in $params['pObj']->applicationData['tx_crawler'];
     * otherwise the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only requests carrying the crawler header are handled here
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad the parts so a header without ":" yields an empty hash instead of a notice
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() returns the row itself, so it must not be unpacked with list()
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // NOTE(review): the hash comes from the request; consider hash_equals() if PHP >= 5.6 is guaranteed.
        $isAuthenticated = is_array($queueRec)
            && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        if (!$isAuthenticated) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matched, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1735
1736
    /*****************************
1737
     *
1738
     * Compiling URLs to crawl - tools
1739
     *
1740
     *****************************/
1741
1742
    /**
     * Walks the page tree below a page and collects the result rows of
     * drawURLs_addRowsForPage() for every page found. Pages of doktype 7
     * (mount points) additionally get the rows of their mounted tree.
     *
     * The arguments are stored on the instance, where
     * drawURLs_addRowsForPage() reads them; $this->duplicateTrack and
     * $this->downloadUrls are reset on every call.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid so nothing but an integer can reach the WHERE clause
                $mountRows = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . (int)$data['row']['uid']);
                $mountpage = (is_array($mountRows) && isset($mountRows[0])) ? $mountRows[0] : null;

                // Without a page record there is nothing to mount
                if ($mountpage !== null) {
                    // fetch mounted pages
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1840
1841
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>"
     * parts. A part without depth excludes only the page itself; a part with
     * "+" but an empty (or zero) depth is expanded with depth 99. Results and
     * page trees are kept in static caches for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result is a valid cache entry as well
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad so a part without "+" does not raise an undefined offset notice
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
1892
1893
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations come from getUrlsForPageRow() and are limited to
     * $this->incomingConfigurationSelection when that selection is not empty.
     * For every configuration that does not exclude the page,
     * urlListFromUrlArray() is called with the settings stored on the instance.
     *
     * @param    array        $pageRow Page row
     * @param    string       $pageTitleAndIcon Page icon and title for row (inserted as HTML)
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (!count($configurations)) {
            // NOTE(review): $skipMessage is set by getUrlsForPageRow(); whether it is already HTML-safe is not visible here.
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Labels of the sub configuration options listed in the "options" column
        $optionLabels = [
            'userGroups' => 'User Groups',
            'baseUrl' => 'Base Url',
            'procInstrFilter' => 'ProcInstr',
        ];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // Title column: only the first row carries it, spanning all rows of the page
            $titleClm = $c
                ? ''
                : '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');

            if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                    $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= count($gVal);
                    $calcAccu[] = count($gVal);
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options: configuration values are escaped like every other value in the row
                $optionValues = '';
                foreach ($optionLabels as $optionKey => $optionLabel) {
                    if (!empty($confArray['subCfg'][$optionKey])) {
                        $optionValues .= $optionLabel . ': ' . htmlspecialchars($confArray['subCfg'][$optionKey]) . '<br/>';
                    }
                }

                // Compile row:
                $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
            } else {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
2010
2011
    /*****************************
2012
     *
2013
     * CLI functions
2014
     *
2015
     *****************************/
2016
2017
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when "-h" or "--help" is given.
     * Otherwise, unless the crawler is disabled or no new process can be
     * acquired, runs CLI_run() with the "--countInARun", "--sleepTime" and
     * "--sleepAfterFinish" arguments (falling back to the extension
     * settings), removes process records without assigned items and
     * releases the own process.
     *
     * @return int Status built from the CLI_STATUS_* class constants, combined with bitwise OR
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
        // Seconds
        $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
        // Milliseconds
        $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $e) {
            $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

        //TODO can't we do that in a clean way?
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        // Count once; the value is used for the debug output and the status
        $unprocessedItems = $this->queueRepository->countUnprocessedItems();

        $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
        $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);

        return $result;
    }
2064
2065
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Compiles the URLs for the page tree below the given page id and,
     * depending on the "-o" argument, prints them ("url"), puts them into
     * the queue ("queue"), executes them right away ("exec") or only lists
     * what was found (any other value).
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode decides what happens with the compiled URLs
        $outputMode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): $this->setID is only assigned further down, so this event carries the previous value - confirm this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $procInstructions = (is_array($p) && isset($p['procInstructions']) && is_array($p['procInstructions']))
                    ? $p['procInstructions']
                    : [];
                $cliObj->cli_echo($p['url'] . ' (' . implode(',', $procInstructions) . ') => ');

                $result = $this->readUrlFromArray($queueRec);
                // Guard against results that are not arrays or carry no content
                $responseContent = (is_array($result) && isset($result['content'])) ? $result['content'] : '';

                // NOTE(review): unserialize() runs on content fetched over HTTP; it must only ever see responses of trusted pages.
                $requestResult = unserialize($responseContent);
                if (is_array($requestResult)) {
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($responseContent)), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2168
2169
    /**
     * CLI entry point that hands the page id and the "-o" mode ("all",
     * "finished" or "pending") to getLogEntriesForPageId(). Page id 0 is
     * passed on as a full flush. Any other mode only prints the help text.
     *
     * @return bool False if the mode is unknown or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id argument only the help text is printed
        $hasPageArgument = isset($cliObj->cli_args['_DEFAULT'][1]);
        $cliObj->cli_validateArgs();
        if (!$hasPageArgument) {
            $cliObj->cli_help();
            exit;
        }

        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        switch ($mode) {
            case 'all':
                // "all" is passed on as an empty filter
                return $this->getLogEntriesForPageId($pageId, '', true, $fullFlush) !== false;
            case 'finished':
            case 'pending':
                return $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush) !== false;
        }

        // Unknown mode: show usage and report failure
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();

        return false;
    }
2212
2213
    /**
     * Reads the comma separated value of the "-conf" CLI option and returns
     * it as a list of trimmed configuration keys.
     *
     * @param QueueCommandLineController $cliObj CLI controller holding the parsed arguments
     * @return array List of configuration keys; empty array when "-conf" is empty
     *
     * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
     */
    protected function getConfigurationKeys(QueueCommandLineController $cliObj)
    {
        $rawKeys = trim($cliObj->cli_argValue('-conf'));
        if ($rawKeys == '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $rawKeys);
    }
2226
2227
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, optionally purges old executed queue entries,
     * reserves up to $countInARun due queue entries for the current process
     * and hands each of them to readUrl().
     *
     * @param int $countInARun Maximum number of queue entries to reserve and process in this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, passed to sleep()
     * @return int Bitmask: OR-combination of the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue: drop executed entries older than "purgeQueueDays" days
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (false == $del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // A failed SELECT does not yield an array; treat that like an empty queue
        if (is_array($rows) && count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                // qid goes into the SQL below, so force it to an integer
                $quidList[] = intval($r['qid']);
            }

            $processId = $this->CLI_buildProcessId();

            // Reserve queue entries for this process
            $this->db->sql_query('BEGIN');
            // Only take entries that are still unassigned and unexecuted. Without this
            // guard a row grabbed by a concurrent process in the meantime would simply
            // be overwritten and the collision check below could never trigger.
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN (' . implode(',', $quidList) . ') AND exec_time=0 AND process_scheduled=0',
                [
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId
                ]
            );

            // Save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '" . $processId . "'",
                [
                    'assigned_items_count' => intval($numberOfAffectedRows)
                ]
            );

            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                // Fewer rows reserved than selected: another process got in between
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // If the CLI has been disabled between the start and the current readUrl()
                // we return right away and do NOT mark the process as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2342
2343
    /**
     * Activate hooks
     *
     * Instantiates every hook registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // Nothing to do when no hooks are registered (also avoids undefined-index notices)
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: taking a reference (=&) to a function's return value
            // triggers "Only variables should be assigned by reference"
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2360
2361
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past count as orphans: they do
     * not count against the process limit and are released afterwards.
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was created, false when the
     *                 system process id is unavailable or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $processResource = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $now = $this->getCurrentTime();

        // Split the active processes into expired (orphaned) and still running ones
        while ($process = $this->db->sql_fetch_assoc($processResource)) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // Only add a new process while the configured limit is not reached yet
        $acquired = $runningProcesses < intval($this->extensionSettings['processLimit']);

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $newProcess = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($now + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $newProcess);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2425
2426
    /**
     * Release a process and the required resources
     *
     * Performs, in this order:
     *  1. detaches all queue entries from processes that are inactive and not deleted,
     *  2. marks inactive processes without unexecuted queue entries as deleted,
     *  3. marks the requested processes as inactive,
     *  4. detaches the unexecuted queue entries of the requested processes.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        // The former "!count($releaseIds) > 0" negated the count BEFORE comparing
        // and only worked by accident of operator precedence
        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quote and escape every id through the DB layer; the same single-quoted list is
        // used in both statements (double-quoted literals break in ANSI_QUOTES SQL mode)
        $quotedReleaseIds = implode(',', $this->db->fullQuoteArray($releaseIds, 'tx_crawler_process'));

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach the queue entries of all processes which are not active (and not deleted yet)
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedReleaseIds . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // detach the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedReleaseIds . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2495
2496
    /**
     * Removes all rows from tx_crawler_process that carry the deleted flag.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $processTable = 'tx_crawler_process';
        $onlyFlaggedRows = 'deleted = 1';

        $this->db->exec_DELETEquery($processTable, $onlyFlaggedRows);
    }
2505
2506
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the non-deleted process record with the given id and reports
     * whether its "active" flag is set. The lookup is a single SELECT and
     * therefore runs without its own transaction.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false when no matching record exists
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            // Quote/escape the id through the DB layer instead of concatenating it raw
            'process_id = ' . $this->db->fullQuoteStr($pid, 'tx_crawler_process') . ' AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        $row = $this->db->sql_fetch_assoc($res);
        if (!is_array($row)) {
            return false;
        }

        return intval($row['active']) == 1;
    }
2534
2535
    /**
     * Returns the id of the current process.
     *
     * The id is generated once from the current microtime and then kept in
     * $this->processID, so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2547
2548
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Passed through unchanged to microtime()
     *
     * @return mixed Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2557
2558
    /**
     * Writes a message followed by a newline to the output and flushes it.
     * Does nothing unless the "processDebug" extension setting is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2570
2571
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary, passing the frontend base path, the URL and the
     * base64 encoded, serialized request headers, and executes it.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array when parse_url() does not return an array for the URL,
     *                otherwise an array with the keys "request", "headers" (always an
     *                empty string) and "content" (output of the shell command)
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every part is escaped on its own and joined with single spaces
        $command = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers)))
        ]);

        // Log the URL together with the time the command took
        $startedAt = microtime(true);
        $output = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $output
        ];
    }
2609
2610
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries whose scheduled time is at least "cleanUpScheduledAge" days in the past
     *
     * Both ages are read from the extension settings and are interpreted as days.
     *
     * @return void
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2628
2629
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates a new TypoScriptFrontendController, registers it as $GLOBALS['TSFE']
     * and runs its initialisation steps in a fixed order.
     *
     * @param int $id Page id handed to the frontend controller and used for the root line
     * @param int $typeNum Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        // NOTE: NullTimeTracker and its start() method are deprecated since TYPO3 v8
        // and announced for removal in v9 (use the regular time tracking there).
        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new NullTimeTracker();
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $frontend = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        // Register the global first: the calls below run against this very instance
        $GLOBALS['TSFE'] = $frontend;

        $frontend->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontend->sys_page->init(true);
        $frontend->connectToDB();
        $frontend->initFEuser();
        $frontend->determineId();
        $frontend->initTemplate();
        $frontend->rootLine = $frontend->sys_page->getRootLine($id, '');
        $frontend->getConfigArray();
        PageGenerator::pagegenInit();
    }
2656
2657
    /**
     * Builds an md5 hash of the serialized configuration array, ignoring the
     * "paramExpanded" and "URLs" entries.
     *
     * @param array $configuration Configuration to hash (the caller's array is not modified)
     *
     * @return string md5 hash of the remaining configuration
     */
    protected function getConfigurationHash(array $configuration)
    {
        // These two entries must not influence the hash
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2669
2670
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration value decides:
     *  -1 => false,
     *   0 => the page configuration value is returned unchanged,
     *   1 => true,
     *  anything else => false.
     * The value is compared loosely (==), in the order listed above.
     *
     * @param mixed $crawlerConfiguration Protocol setting of the crawler configuration
     * @param mixed $pageConfiguration Value returned as-is when the crawler setting is 0
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2690
}
2691