Completed
Push — issue/322 ( 6d8e95 )
by Tomas Norre
22:59
created

CrawlerCommandController::getConfigurationKeys()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 1
dl 0
loc 5
ccs 0
cts 5
cp 0
crap 6
rs 10
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Command;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Model\Reason;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use Helhum\Typo3Console\Mvc\Controller\CommandController;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
37
/**
 * Class CrawlerCommandController
 *
 * Command line (typo3cms) entry points for the crawler: flushing the queue,
 * building the queue and processing the queue.
 *
 * The method docblocks double as the CLI help texts of the commands, so keep
 * the parameter descriptions short and free of markup.
 */
class CrawlerCommandController extends CommandController
{
    // Status flags of crawlQueueCommand(); they are combined with bitwise OR.
    // The spelling of CLI_STATUS_NOTHING_PROCCESSED is kept as is, because the
    // constant name is part of the public interface of this class.
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
    const CLI_STATUS_REMAIN = 1; // queue not empty
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
    const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * Crawler Command - Cleaning up the queue.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It will remove queue entries and perform a cleanup.
     *
     * Examples:
     *
     * --- Remove all finished queue-entries in the sub-branch of page 5
     * $ typo3cms crawler:flushqueue --mode finished --page 5
     *
     * --- Remove all pending queue-entries for all pages
     * $ typo3cms crawler:flushqueue --mode pending
     *
     * @param string $mode Specifies the type of queue entries which are flushed in the process: "finished", "all" or "pending".
     * @param int $page Page to start clearing the queue recursively, 0 is default and clears all.
     *
     * @return void
     */
    public function flushQueueCommand($mode = 'finished', $page = 0)
    {
        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);

        // Negative page ids are clamped to 0; page 0 requests a full flush.
        $pageId = MathUtility::forceIntegerInRange($page, 0);
        $fullFlush = ($pageId === 0);

        switch ($mode) {
            case 'all':
                // An empty filter addresses every entry, whatever its state.
                $crawlerController->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                // The mode name is passed on unchanged as the entry filter.
                $crawlerController->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
                $this->outputLine("<info>No matching parameters found." . chr(10) . "Try 'typo3cms help crawler:flushqueue' to see your options</info>");
                break;
        }
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3cms crawler:buildqueue --startpage 7 --depth 2 --conf <configurationKey> --mode exec
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3cms crawler:buildqueue --startpage 7 --depth 0 --conf <configurationKey> --number 4 --mode queue
     *
     * @param int $startpage The page from where the queue building should start.
     * @param int $depth Tree depth, 0-99. How many levels under the start page to include.
     * @param string $mode Output mode: "url", "exec" or "queue". url: Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!
     * @param int $number Specifies how many items are put in the queue per minute (1-1000, 30 if omitted). Only valid for output mode "queue"
     * @param string $conf A comma separated list of crawler configurations
     *
     * @return void
     */
    public function buildQueueCommand($startpage = 0, $depth = 0, $mode = '', $number = 0, $conf = '')
    {
        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);

        // NOTE(review): an earlier revision forced the backend user to admin
        // state and the "Live" workspace at this point; that code was disabled.

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        // This command is only reachable over the CLI, so the start page
        // always comes from the --startpage argument.
        $pageId = MathUtility::forceIntegerInRange($startpage, 0);

        $configurationKeys = $this->getConfigurationKeys($conf);

        // NOTE(review): getConfigurationKeys() always returns an array, so this
        // fallback to "all configurations of the page" is currently never
        // taken - confirm whether an empty --conf was meant to trigger it.
        if (!is_array($configurationKeys)) {
            $configurations = $crawlerController->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        $writesToQueue = ($mode === 'queue' || $mode === 'exec');

        if ($writesToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): the event carries the set id as it is *before* the
            // new id is generated further down - verify that this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $crawlerController->setID,
                ['reason' => $reason]
            );
        }

        if ($crawlerController->extensionSettings['cleanUpOldQueueEntries']) {
            $crawlerController->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = (int) GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($depth, 0, 99),
            $crawlerController->getCurrentTime(),
            MathUtility::forceIntegerInRange($number ? intval($number) : 30, 1, 1000),
            $writesToQueue,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $this->outputLine("<info>" . implode(chr(10), $crawlerController->downloadUrls) . chr(10) . "</info>");
        } elseif ($mode === 'exec') {
            $this->outputLine("<info>Executing " . count($crawlerController->urlList) . " requests right away:</info>");
            $this->outputLine("<info>" . implode(chr(10), $crawlerController->urlList) . "</info>" . chr(10));
            $this->outputLine("<info>Processing</info>" . chr(10));

            foreach ($crawlerController->queueEntries as $queueRec) {
                // A broken record must not abort the whole run: fall back to
                // empty values if the parameters cannot be unserialized.
                $parameters = unserialize($queueRec['parameters']);
                $url = isset($parameters['url']) ? $parameters['url'] : '';
                $procInstructions = (isset($parameters['procInstructions']) && is_array($parameters['procInstructions']))
                    ? $parameters['procInstructions']
                    : [];
                $this->outputLine("<info>" . $url . " (" . implode(',', $procInstructions) . ") => " . "</info>" . chr(10));

                $result = $crawlerController->readUrlFromArray($queueRec);

                // SECURITY NOTE(review): the content is the response of the
                // crawled request; unserialize() on data that may come from
                // outside is risky - consider restricting allowed classes.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    $logIndent = chr(10) . chr(9) . chr(9);
                    $resLog = (isset($requestResult['log']) && is_array($requestResult['log']))
                        ? $logIndent . implode($logIndent, $requestResult['log'])
                        : '';
                    $this->outputLine("<info>OK: " . $resLog . "</info>" . chr(10));
                } else {
                    // "<error>" is the console style tag; the former "<errror>"
                    // was not a valid tag name.
                    $this->outputLine("<error>Error checking Crawler Result:  " . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10) . "</error>" . chr(10));
                }
            }
        } elseif ($mode === 'queue') {
            $this->outputLine("<info>Putting " . count($crawlerController->urlList) . " entries in queue:</info>" . chr(10));
            $this->outputLine("<info>" . implode(chr(10), $crawlerController->urlList) . "</info>" . chr(10));
        } else {
            $this->outputLine("<info>" . count($crawlerController->urlList) . " entries found for processing. (Use --mode to decide action):</info>" . chr(10));
            $this->outputLine("<info>" . implode(chr(10), $crawlerController->urlList) . "</info>" . chr(10));
        }
    }

    /**
     * Crawler Command - Crawling the URLs from the queue
     *
     * Examples:
     *
     * --- Will trigger the crawler which starts to process the queue entries
     * $ typo3cms crawler:crawlqueue
     *
     * @param int $amount How many pages should be crawled during that run.
     * @param int $sleeptime Amount of milliseconds which the system should use to relax between crawls.
     * @param int $sleepafter Amount of seconds which the system should use to relax after all crawls are done.
     *
     * @return int Status of the run, OR-combined with the CLI_STATUS_* flags
     */
    public function crawlQueueCommand($amount = 0, $sleeptime = 0, $sleepafter = 0)
    {
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);
        $queueRepository = $this->objectManager->get(QueueRepository::class);

        $settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        $settings = is_array($settings) ? $settings : [];
        $crawlerController->setExtensionSettings($settings);

        $processId = $crawlerController->CLI_buildProcessId();
        if ($crawlerController->getDisabled() || !$crawlerController->CLI_checkAndAcquireNewProcess($processId)) {
            // Crawler disabled or no process slot available.
            return $result | self::CLI_STATUS_ABORTED;
        }

        // A CLI argument of 0 (the default) falls back to the extension setting.
        // --sleeptime is the pause between crawls, --sleepafter the pause after
        // the run; these two were previously assigned the wrong way round.
        $countInARun = $amount ? intval($amount) : $crawlerController->extensionSettings['countInARun'];
        $sleepTime = $sleeptime ? intval($sleeptime) : $crawlerController->extensionSettings['sleepTime'];
        $sleepAfterFinish = $sleepafter ? intval($sleepafter) : $crawlerController->extensionSettings['sleepAfterFinish'];

        try {
            // Run process:
            $result = $crawlerController->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $e) {
            $this->outputLine("<warning>". get_class($e) . ': ' . $e->getMessage() . "</warning>");
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');
        $crawlerController->CLI_releaseProcesses($crawlerController->CLI_buildProcessId());

        // Query the remaining items once and use the value for both the
        // message and the status flag.
        $unprocessedItems = $queueRepository->countUnprocessedItems();
        $this->outputLine("<info>Unprocessed Items remaining:" . $unprocessedItems . " (" . $crawlerController->CLI_buildProcessId() . ")</info>");
        $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);

        return $result;
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list of configuration keys
     * @return array The trimmed keys, or an empty array if $conf is blank
     */
    private function getConfigurationKeys($conf)
    {
        $parameter = trim($conf);
        if ($parameter === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }
}