Completed
Push — master ( 7ea2e2...fb7f1a )
by Tomas Norre
06:45
created

CrawlerCommandController::flushQueueCommand()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
nc 4
nop 2
dl 0
loc 22
ccs 0
cts 18
cp 0
crap 20
rs 9.568
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Command;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Model\Reason;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use AOE\Crawler\Event\EventDispatcher;
32
use Helhum\Typo3Console\Mvc\Controller\CommandController;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
37
/**
 * Class CrawlerCommandController
 *
 * Command line entry points of the crawler extension: flushing the queue,
 * building the queue and processing (crawling) the queue.
 */
class CrawlerCommandController extends CommandController
{
    /*
     * Status flags returned by crawlQueueCommand(). The values are distinct bits
     * because the command combines them with bitwise OR.
     * NOTE: the misspelling "PROCCESSED" is kept on purpose, the constant is public API.
     */
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
    const CLI_STATUS_REMAIN = 1; // queue not empty
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
    const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * Crawler Command - Cleaning up the queue.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It will remove queue entries and perform a cleanup.
     *
     * Examples:
     *
     * --- Remove all finished queue-entries in the sub-branch of page 5
     * $ typo3cms crawler:flushqueue --mode finished --page 5
     *
     * --- Remove all pending queue-entries for all pages
     * $ typo3cms crawler:flushqueue --mode pending
     *
     * @param string $mode Output mode: "finished", "all", "pending". Specifies the type of queue entries which are flushed in the process.
     * @param int $page Page to start clearing the queue recursively, 0 is default and clears all.
     *
     */
    public function flushQueueCommand($mode = 'finished', $page = 0)
    {
        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);

        // Negative or non-numeric input is clamped to 0, which means "flush everything".
        $pageId = MathUtility::forceIntegerInRange($page, 0);
        $fullFlush = ($pageId === 0);

        switch ($mode) {
            case 'all':
                // An empty filter selects every queue entry regardless of its state.
                $crawlerController->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                $crawlerController->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
                $this->outputLine('<info>No matching parameters found.' . PHP_EOL . 'Try "typo3cms help crawler:flushqueue" to see your options</info>');
                break;
        }
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3cms crawler:buildqueue --startpage 7 --depth 2 --conf <configurationKey> --mode exec
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3cms crawler:buildqueue --startpage 7 --depth 0 --conf <configurationKey> --number 4 --mode queue
     *
     * @param int $startpage The page from where the queue building should start.
     * @param int $depth Tree depth, 0-99. How many levels under the 'page_id' to include.
     * @param string $mode Output mode: "url", "exec", "queue". Specifies output modes url: Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!
     * @param int $number Specifies how many items are put in the queue per minute. Only valid for output mode "queue"
     * @param string $conf A comma separated list of crawler configurations
     */
    public function buildQueueCommand($startpage = 0, $depth = 0, $mode = '', $number = 0, $conf = '')
    {
        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        if (defined('TYPO3_MODE') && 'BE' === TYPO3_MODE) {
            // Crawler is called over TYPO3 BE
            $pageId = 1;
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($startpage, 0);
        }

        $configurationKeys = $this->getConfigurationKeys($conf);

        // NOTE(review): getConfigurationKeys() appears to always return an array, so this
        // fallback looks unreachable - confirm whether an empty list was meant to trigger it.
        if (!is_array($configurationKeys)) {
            $configurations = $crawlerController->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        $submitToQueue = ($mode === 'queue' || $mode === 'exec');

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): setID is only assigned further down, so the event receives the
            // value it had before this run - verify that this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $crawlerController->setID,
                ['reason' => $reason]
            );
        }

        if ($crawlerController->extensionSettings['cleanUpOldQueueEntries']) {
            $crawlerController->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = (int) GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($depth, 0, 99),
            $crawlerController->getCurrentTime(),
            // Falls back to 30 when no number is given; always clamped to 1..1000.
            MathUtility::forceIntegerInRange($number ? (int) $number : 30, 1, 1000),
            $submitToQueue,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $this->outputLine('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $this->outputLine('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
            $this->outputLine('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
            $this->outputLine('<info>Processing</info>' . PHP_EOL);

            foreach ($crawlerController->queueEntries as $queueRec) {
                // NOTE(review): unserialize() must only ever see data written by the crawler itself.
                $p = unserialize($queueRec['parameters']);
                $this->outputLine('<info>' . $p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ' . '</info>' . PHP_EOL);
                $result = $crawlerController->readUrlFromArray($queueRec);

                // NOTE(review): the response content is unserialized as well - confirm it is trusted.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // Each log line is printed on its own line, indented by two tabs.
                    $resLog = is_array($requestResult['log']) ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $requestResult['log']) : '';
                    $this->outputLine('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
                } else {
                    // Content that is not a serialized array is shown as stripped, whitespace-collapsed text.
                    $this->outputLine('<error>Error checking Crawler Result:  ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
                }
            }
        } elseif ($mode === 'queue') {
            $this->outputLine('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $this->outputLine('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        } else {
            $this->outputLine('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use --mode to decide action):</info>' . PHP_EOL);
            $this->outputLine('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        }
    }

    /**
     * Crawler Command - Crawling the URLs from the queue
     *
     * Examples:
     *
     * --- Will trigger the crawler which starts to process the queue entries
     * $ typo3cms crawler:crawlqueue
     *
     * @param int $amount How many pages should be crawled during that run.
     * @param int $sleeptime Amount of milliseconds which the system should use to relax between crawls.
     * @param int $sleepafter Amount of seconds which the system should use to relax after all crawls are done.
     *
     * @return int Combination of the CLI_STATUS_* flags
     */
    public function crawlQueueCommand($amount = 0, $sleeptime = 0, $sleepafter = 0)
    {
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        /** @var CrawlerController $crawlerController */
        $crawlerController = $this->objectManager->get(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = $this->objectManager->get(QueueRepository::class);

        // NOTE(review): extension configuration is stored serialized; it is treated as trusted here.
        $settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        $settings = is_array($settings) ? $settings : [];
        $crawlerController->setExtensionSettings($settings);

        if (!$crawlerController->getDisabled() && $crawlerController->CLI_checkAndAcquireNewProcess($crawlerController->CLI_buildProcessId())) {
            // Command line values win; a value of 0 falls back to the extension settings.
            $countInARun = $amount ? (int) $amount : $crawlerController->extensionSettings['countInARun'];
            // --sleeptime is the pause between crawls, --sleepafter the pause after the run.
            $sleepTime = $sleeptime ? (int) $sleeptime : $crawlerController->extensionSettings['sleepTime'];
            $sleepAfterFinish = $sleepafter ? (int) $sleepafter : $crawlerController->extensionSettings['sleepAfterFinish'];

            try {
                // Run process:
                $result = $crawlerController->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->outputLine('<warning>'. get_class($e) . ': ' . $e->getMessage() . '</warning>');
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');
            $crawlerController->CLI_releaseProcesses($crawlerController->CLI_buildProcessId());

            // Read the count once so the printed number and the returned flag always agree.
            $unprocessedItems = $queueRepository->countUnprocessedItems();
            $this->outputLine('<info>Unprocessed Items remaining:' . $unprocessedItems . ' (' . $crawlerController->CLI_buildProcessId() . ')</info>');
            $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            // Crawler disabled or no process slot could be acquired.
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list of configuration keys
     * @return array List of trimmed keys; empty when $conf is blank
     */
    private function getConfigurationKeys($conf)
    {
        $parameter = trim($conf);
        return ($parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : []);
    }
}
262