|
1
|
|
|
<?php |
|
2
|
|
|
declare(strict_types=1); |
|
3
|
|
|
namespace AOE\Crawler\Command; |
|
4
|
|
|
|
|
5
|
|
|
/*************************************************************** |
|
6
|
|
|
* Copyright notice |
|
7
|
|
|
* |
|
8
|
|
|
* (c) 2019 AOE GmbH <[email protected]> |
|
9
|
|
|
* |
|
10
|
|
|
* All rights reserved |
|
11
|
|
|
* |
|
12
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
|
13
|
|
|
* free software; you can redistribute it and/or modify |
|
14
|
|
|
* it under the terms of the GNU General Public License as published by |
|
15
|
|
|
* the Free Software Foundation; either version 3 of the License, or |
|
16
|
|
|
* (at your option) any later version. |
|
17
|
|
|
* |
|
18
|
|
|
* The GNU General Public License can be found at |
|
19
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
|
20
|
|
|
* |
|
21
|
|
|
* This script is distributed in the hope that it will be useful, |
|
22
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
23
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
24
|
|
|
* GNU General Public License for more details. |
|
25
|
|
|
* |
|
26
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
|
27
|
|
|
***************************************************************/ |
|
28
|
|
|
|
|
29
|
|
|
use AOE\Crawler\Controller\CrawlerController; |
|
30
|
|
|
use AOE\Crawler\Domain\Model\Reason; |
|
31
|
|
|
use AOE\Crawler\Event\EventDispatcher; |
|
32
|
|
|
use Symfony\Component\Console\Input\InputInterface; |
|
33
|
|
|
use Symfony\Component\Console\Input\InputOption; |
|
34
|
|
|
use Symfony\Component\Console\Output\OutputInterface; |
|
35
|
|
|
use Symfony\Component\Console\Command\Command; |
|
36
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
|
37
|
|
|
use TYPO3\CMS\Core\Utility\MathUtility; |
|
38
|
|
|
use TYPO3\CMS\Extbase\Object\ObjectManager; |
|
39
|
|
|
|
|
40
|
|
|
class BuildQueueCommand extends Command
{
    /**
     * Declares the help text and the command line options of the command.
     *
     * Options: "conf" (comma separated configuration keys), "page" (start page),
     * "depth" (tree depth), "mode" (url | queue | exec) and "number" (items per minute).
     */
    protected function configure(): void
    {
        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
Examples:
--- Re-cache pages from page 7 and two levels down, executed immediately
$ typo3 crawler:buildQueue --page 7 --depth 2 --conf defaultConfiguration --mode exec

--- Put entries for re-caching pages from page 7 into queue, 4 every minute.
$ typo3 crawler:buildQueue --page 7 --depth 0 --conf defaultConfiguration --mode queue --number 4
'
        );

        $this->addOption(
            'conf',
            'c',
            InputOption::VALUE_REQUIRED,
            'A comma separated list of crawler configurations'
        );

        $this->addOption(
            'page',
            'p',
            InputOption::VALUE_OPTIONAL,
            'The page from where the queue building should start',
            0
        );

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the page to include.',
            0
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            0
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Collects the URLs below the page given by "--page" (down to "--depth" levels)
     * and, depending on "--mode", lists them ("url"), submits them to the queue
     * ("queue") or processes them right away ("exec"). Without a recognised mode
     * the found URLs are only printed.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue --page 7 --depth 2 --conf defaultConfiguration --mode exec
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue --page 7 --depth 0 --conf defaultConfiguration --mode queue --number 4
     *
     * @param InputInterface $input Parsed command line input
     * @param OutputInterface $output Console output the report is written to
     * @return int Exit code, always 0
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $mode = $input->getOption('mode');
        $submitsToQueue = $mode === 'queue' || $mode === 'exec';

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = $objectManager->get(CrawlerController::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        // The start page comes from the declared "page" option (default 0). The former
        // check on $this->request could never succeed in a Symfony command, which pinned
        // the start page to 1 and silently ignored "--page".
        $pageId = MathUtility::forceIntegerInRange($input->getOption('page'), 0);

        $configurationKeys = $this->getConfigurationKeys($input->getOption('conf'));

        // Fallback kept from the original flow: derive the keys from the page's
        // configurations when no key list could be built.
        if (!is_array($configurationKeys)) {
            $configurations = $crawlerController->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($submitsToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): setID is only assigned further down, so the event is posted with
            // whatever value the controller held before - confirm this order is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $crawlerController->setID,
                ['reason' => $reason]
            );
        }

        if ($crawlerController->extensionSettings['cleanUpOldQueueEntries']) {
            $crawlerController->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = (int) GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($input->getOption('depth'), 0, 99),
            $crawlerController->getCurrentTime(),
            // "number" defaults to 0, which falls back to 30 items per minute.
            MathUtility::forceIntegerInRange($input->getOption('number') ?: 30, 1, 1000),
            $submitsToQueue,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
            $output->writeln('<info>Processing</info>' . PHP_EOL);

            foreach ($crawlerController->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $output->writeln('<info>' . $p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ' . '</info>' . PHP_EOL);
                $result = $crawlerController->readUrlFromArray($queueRec);

                // NOTE(review): unserialize() runs on content returned by the request;
                // confirm that source is trusted or switch to a safer format.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // A result without a "log" entry simply yields an empty log text.
                    $logLines = $requestResult['log'] ?? null;
                    $resLog = is_array($logLines)
                        ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $logLines)
                        : '';
                    $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
                } else {
                    $output->writeln('<error>Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
                }
            }
        } elseif ($mode === 'queue') {
            $output->writeln('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        } else {
            $output->writeln('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        }

        return 0;
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string|null $conf Comma separated list of configuration keys; null when "--conf" was not passed
     * @return array List of trimmed configuration keys, empty when nothing was given
     */
    private function getConfigurationKeys($conf): array
    {
        // Cast first: with strict_types enabled trim(null) raises a TypeError.
        $parameter = trim((string) $conf);

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
}
|
219
|
|
|
|
This error could be the result of:

1. Missing dependencies

PHP Analyzer uses your `composer.json` file (if available) to determine the dependencies of your project and to determine all the available classes and functions. It expects the `composer.json` to be in the root folder of your repository.

Are you sure this class is defined by one of your dependencies, or did you maybe not list a dependency in either the `require` or `require-dev` section?

2. Missing use statement

PHP does not complain about undefined classes in `instanceof` checks. For example, the following PHP code will work perfectly fine:

    if ($x instanceof DoesNotExist) {
        // Do something.
    }

If you have not tested against this specific condition, such errors might go unnoticed.