1 | <?php |
||||
2 | |||||
3 | namespace PhpTek\Exodus\Tool; |
||||
4 | |||||
5 | use phpQuery; |
||||
6 | use DOMDocument; |
||||
7 | use DOMElement; |
||||
8 | use PhpTek\Exodus\Tool\StaticSiteMimeProcessor; |
||||
9 | use PhpTek\Exodus\Tool\StaticSiteUtils; |
||||
10 | use SilverStripe\Core\Injector\Injectable; |
||||
11 | use SilverStripe\Control\Director; |
||||
12 | use SilverStripe\Control\HTTPResponse; |
||||
13 | use SilverStripe\Core\Config\Configurable; |
||||
14 | use SilverStripe\Core\Convert; |
||||
15 | use SilverStripe\Core\TempFolder; |
||||
16 | |||||
17 | /** |
||||
18 | * This tool uses a combination of cURL and phpQuery to extract content from a URL. |
||||
19 | * |
||||
20 | * The URL is first downloaded using cURL, and then passed into phpQuery for processing. |
||||
21 | * Given a set of fieldnames and CSS selectors corresponding to them, a map of content |
||||
22 | * fields will be returned. |
||||
23 | * |
||||
24 | * If the URL represents a file-based Mime-Type, a Silverstripe `File` object is created and the |
||||
25 | * physical file it represents can then be post-processed and saved to the database and F/S. |
||||
26 | * |
||||
27 | * @package phptek/silverstripe-exodus |
||||
28 | * @author Sam Minee <[email protected]> |
||||
29 | * @author Russell Michell <[email protected]> |
||||
30 | */ |
||||
31 | class StaticSiteContentExtractor |
||||
32 | { |
||||
33 | use Injectable; |
||||
34 | use Configurable; |
||||
35 | |||||
36 | /** |
||||
37 | * |
||||
38 | * @var string |
||||
39 | */ |
||||
40 | protected $url = null; |
||||
41 | |||||
42 | /** |
||||
43 | * |
||||
44 | * @var string |
||||
45 | */ |
||||
46 | protected $mime = null; |
||||
47 | |||||
48 | /** |
||||
49 | * This is an HTML page's source markup. |
||||
50 | * |
||||
51 | * @var string |
||||
52 | */ |
||||
53 | protected $content = null; |
||||
54 | |||||
55 | /** |
||||
56 | * |
||||
57 | * @var \phpQueryObject|null |
||||
0 ignored issues
–
show
Bug
introduced
by
![]() |
|||||
58 | */ |
||||
59 | protected $phpQuery = null; |
||||
60 | |||||
61 | /** |
||||
62 | * |
||||
63 | * @var string |
||||
64 | */ |
||||
65 | protected $tmpFileName = ''; |
||||
66 | |||||
67 | /** |
||||
68 | * "Caches" the mime-processor for use throughout |
||||
69 | * |
||||
70 | * @var StaticSiteMimeProcessor |
||||
71 | */ |
||||
72 | protected $mimeProcessor; |
||||
73 | |||||
74 | /** |
||||
75 | * Holds the StaticSiteUtils object on construct |
||||
76 | * |
||||
77 | * @var StaticSiteUtils |
||||
78 | */ |
||||
79 | protected $utils; |
||||
80 | |||||
/**
 * Bind a new extractor to a single URL and its Mime-Type.
 *
 * Stores the arguments, resolves the shared mime-processor and utility singletons,
 * and writes a "begin extraction" entry to the log.
 *
 * @param string      $url     The absolute URL to extract content from.
 * @param string      $mime    The Mime-Type associated with $url.
 * @param string|null $content Optional markup to start from; useful for crude tests
 *                             that avoid rigging-up a URL to parse.
 */
public function __construct($url, $mime, $content = null)
{
    // Request details first ...
    $this->url = $url;
    $this->mime = $mime;
    $this->content = $content;

    // ... then the collaborators used by the other methods.
    $this->mimeProcessor = singleton(StaticSiteMimeProcessor::class);
    $this->utils = singleton(StaticSiteUtils::class);

    $startMessage = sprintf('Begin extraction for URL: %s and Mime: %s', $this->url, $this->mime);
    $this->utils->log($startMessage);
}
||||
100 | |||||
/**
 * Extract content for a map of field name => extraction rule(s).
 *
 * A field may carry one rule or a list of rules. Rules are tried in order and the first
 * one that yields non-empty content is used; the remaining rules for that field are
 * skipped. A rule is either a bare CSS selector string or an array using the keys read
 * below: 'selector', 'attribute', 'outerhtml', 'excludeselectors' and 'plaintext'.
 *
 * For HTML Mime-Types the content comes from extractField() and is then passed through
 * excludeContent(). For file/image Mime-Types the content is $item->externalId.
 *
 * @param array  $selectorMap A map of field name => rule or list of rules.
 * @param object $item        The item being extracted; only its externalId property is
 *                            read here (for file/image Mime-Types).
 * @return array Map of field name => the matching rule plus a 'content' key.
 */
public function extractMapAndSelectors($selectorMap, $item): array
{
    if (!$this->phpQuery) {
        $this->fetchContent();
    }

    // The Mime-Type cannot change between rules, so classify it once.
    $isHtml = (bool) $this->isMimeHTML();
    $isFileOrImage = !$isHtml && $this->isMimeFileOrImage();

    $output = [];
    foreach ($selectorMap as $fieldName => $extractionRules) {
        if (!is_array($extractionRules)) {
            $extractionRules = [$extractionRules];
        }

        foreach ($extractionRules as $extractionRule) {
            // Normalise the shorthand "just a selector" form.
            if (!is_array($extractionRule)) {
                $extractionRule = ['selector' => $extractionRule];
            }

            // Every optional key gets a default so that sparse rules (notably the
            // shorthand form above) never trigger "undefined array key" warnings.
            $cssSelector = $extractionRule['selector'] ?? '';
            $attribute = $extractionRule['attribute'] ?? '';
            $outerHTML = $extractionRule['outerhtml'] ?? false;
            $excludeSelectors = $extractionRule['excludeselectors'] ?? [];

            $content = '';
            if ($isHtml) {
                $content = $this->extractField($cssSelector, $attribute, $outerHTML);
            } elseif ($isFileOrImage) {
                $content = $item->externalId;
            }

            if (!$content) {
                continue;
            }

            // Further processing: strip excluded sub-elements from HTML content.
            if ($isHtml) {
                $content = $this->excludeContent($excludeSelectors, $cssSelector, $content);
            }

            if (!$content) {
                continue;
            }

            if (!empty($extractionRule['plaintext'])) {
                $content = Convert::html2raw($content);
            }

            // We found a match: keep this rule and ignore any remaining ones.
            $output[$fieldName] = $extractionRule;
            $output[$fieldName]['content'] = $content;
            break;
        }
    }

    return $output;
}
||||
162 | |||||
/**
 * Extract content for a single CSS selector.
 *
 * Result by argument combination:
 *  - neither $outerHTML nor $attribute: the inner HTML of the matched element(s);
 *  - $outerHTML: the concatenated outer HTML of every matched element
 *    (this wins when $attribute is given as well);
 *  - $attribute only: the non-blank values of that attribute, one per line.
 *
 * @param string $cssSelector The CSS selector for which to extract content.
 * @param string $attribute   If set, the value will be read from this HTML attribute.
 * @param bool   $outerHTML   Should the full markup of each matched element be returned?
 * @return string The content for the passed $cssSelector, or '' when nothing is available.
 */
public function extractField(string $cssSelector, string $attribute = '', bool $outerHTML = false): string
{
    if (!$this->phpQuery) {
        // fetchContent() populates $this->phpQuery as a side effect.
        $this->fetchContent();
    }

    // fetchContent() returns early for file/image responses and then leaves
    // $this->phpQuery unset, so there is no document to query.
    if (!$this->phpQuery) {
        return '';
    }

    $elements = $this->phpQuery[$cssSelector];

    // @todo temporary workaround for File objects
    if (!$elements) {
        return '';
    }

    // Plain case: nothing special requested, return the inner HTML.
    // Both options must be absent; testing them with "||" (as this method used to)
    // made the attribute branch below unreachable.
    if (!$outerHTML && $attribute === '') {
        return trim((string) $elements->html());
    }

    $result = '';
    foreach ($elements as $element) {
        if ($outerHTML) {
            // Full markup of this element and its children.
            $result .= $this->getOuterHTML($element);
            continue;
        }

        // Attribute value; blank values are skipped, "0" is kept.
        $value = (string) $element->getAttribute($attribute);
        if (trim($value) !== '') {
            $result .= $value . PHP_EOL;
        }
    }

    return trim($result);
}
||||
204 | |||||
/**
 * Strip from $content the markup matched by one or many CSS selectors.
 *
 * Each non-blank exclude selector is scoped beneath $parentSelector, looked up in the
 * parsed document, and the outer HTML of the match is removed from $content by plain
 * string replacement. One log line is written per processed selector.
 *
 * @param array  $excludeSelectors Selectors, relative to $parentSelector, to remove.
 * @param string $parentSelector   The selector the content was extracted with.
 * @param string $content          The extracted markup to clean.
 * @return string $content with the matched markup removed.
 */
protected function excludeContent($excludeSelectors, $parentSelector, $content)
{
    if ($excludeSelectors) {
        foreach ($excludeSelectors as $candidate) {
            // Ignore blank entries.
            if (!trim($candidate)) {
                continue;
            }

            $scopedSelector = $parentSelector . ' ' . $candidate;
            $matched = $this->phpQuery[$scopedSelector];
            if (!$matched) {
                continue;
            }

            $content = str_replace($matched->htmlOuter(), '', $content);
            $this->utils->log(' - Excluded content from "' . $scopedSelector . '"');
        }
    }

    return $content;
}
||||
232 | |||||
/**
 * Serialise an element, including the element itself and all of its children, to HTML.
 *
 * The element is deep-copied into a scratch document so that it can be serialised on
 * its own.
 *
 * @param DOMElement $element The element to serialise.
 * @return string The element's markup as produced by DOMDocument::saveHTML().
 */
protected function getOuterHTML(DOMElement $element)
{
    $scratch = new DOMDocument();
    $scratch->substituteEntities = false;
    $scratch->preserveWhiteSpace = true;
    $scratch->formatOutput = false;

    // Deep import (second argument true) so child nodes come along.
    $copy = $scratch->importNode($element, true);
    $scratch->appendChild($copy);

    return $scratch->saveHTML();
}
||||
248 | |||||
/**
 * Return the markup currently held by this extractor.
 *
 * This is whatever was passed to the constructor, or the response body stored (and
 * pre-processed) by fetchContent().
 *
 * @return string|null
 */
public function getContent()
{
    $markup = $this->content;

    return $markup;
}
||||
257 | |||||
/**
 * Fetch the URL, then initialise $this->content and $this->phpQuery.
 *
 * When curlRequest() reports that it wrote the response to a temporary file (it returns
 * the string 'file' for file/image Mime-Types), neither property is touched.
 *
 * @return void
 * @todo deal-to defaults when $this->mime isn't matched.
 */
protected function fetchContent()
{
    $this->utils->log(" - Fetching {$this->url} ({$this->mime})");

    // Proxy options for the cURL request; reuse the utils instance resolved in the constructor.
    $curlOpts = $this->utils->defineProxyOpts(!Director::isDev());
    $response = $this->curlRequest($this->url, 'GET', null, null, $curlOpts);

    // Strict comparison: $response is otherwise an HTTPResponse object, which must not
    // be loosely compared against a string.
    if ($response === 'file') {
        // Just stop here for files & images
        return;
    }

    $this->content = $response->getBody();

    // Clean up the content so phpQuery doesn't bork
    $this->prepareContent();
    $this->phpQuery = phpQuery::newDocument($this->content);
}
||||
284 | |||||
/**
 * Use cURL to request a URL.
 *
 * For file/image Mime-Types the response is streamed straight into a temporary file
 * whose name is stored via setTmpFileName(), and the string 'file' is returned. For
 * everything else an HTTPResponse carrying the body, status code and headers of the
 * final response is returned. A cURL error, or a status code of 0, is reported as
 * status 500.
 *
 * @todo Refactor using Guzzle
 * @todo Add checks when fetching multi Mb images to ignore anything over 2Mb??
 * @param string            $url
 * @param string            $method      HTTP method; 'POST' and 'PUT' also send $data.
 * @param string|array|null $data        Request payload for POST/PUT.
 * @param array|null        $headers     Request headers, passed to CURLOPT_HTTPHEADER.
 * @param array             $curlOptions Extra cURL options, applied after the defaults.
 * @return string|HTTPResponse The literal 'file', or the response object.
 */
protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = [])
{
    $this->utils->log(" - CURL START: {$this->url} ({$this->mime})");

    $ch = curl_init();
    $connectTimeout = 10;
    $putStream = null;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, $this->config()->get('user_agent'));
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 120);

    if ($headers) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    }

    // Add fields to POST and PUT requests
    if ($method === 'POST') {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data ?? '');
    } elseif ($method === 'PUT') {
        // $data defaults to null; normalise it so fwrite()/strlen() get a string.
        $payload = (string) $data;
        $putStream = fopen('php://temp', 'r+');
        fwrite($putStream, $payload);
        fseek($putStream, 0);

        curl_setopt($ch, CURLOPT_PUT, 1);
        curl_setopt($ch, CURLOPT_INFILE, $putStream);
        // Byte length is what cURL needs here, hence strlen() rather than mb_strlen().
        curl_setopt($ch, CURLOPT_INFILESIZE, strlen($payload));
    }

    // Follow redirects
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

    // Set any custom options passed to the request() function
    if (is_array($curlOptions) && $curlOptions) {
        curl_setopt_array($ch, $curlOptions);
    }

    // NOTE(review): peer verification is switched off, and because this runs after the
    // custom options it also overrides a caller that tried to enable it. Left unchanged
    // to keep existing crawls working - confirm whether this is still wanted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    // See: http://forums.devshed.com/php-development-5/curlopt-timeout-option-for-curl-calls-isn-t-being-obeyed-605642.html
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout); // No. seconds to wait while trying to connect.

    // Deal to files, write to them directly and then return
    if ($this->mimeProcessor->isOfFileOrImage($this->mime)) {
        // NOTE(review): nothing visible here creates the random sub-directory, so
        // tempnam() is presumably falling back to the system temp dir (which raises a
        // notice, hence the "@"). Kept as-is; confirm the intended location.
        $tmpName = @tempnam(TempFolder::getTempFolder('/') . '/' . rand(), 'tmp');
        $fp = $tmpName !== false ? fopen($tmpName, 'w+') : false;

        if ($fp === false) {
            // Without a writable target there is nothing to download into.
            $this->utils->log(" - CURL ERROR: Error: unable to create a temporary file for {$this->url}");
            curl_close($ch);
            if (is_resource($putStream)) {
                fclose($putStream);
            }
            $this->setTmpFileName('');

            return 'file';
        }

        curl_setopt($ch, CURLOPT_HEADER, 0); // We do not want _any_ header info, it corrupts the file data
        curl_setopt($ch, CURLOPT_FILE, $fp); // write curl response directly to file, no messing about

        if (curl_exec($ch) === false) {
            $downloadError = curl_error($ch);
            $this->utils->log(" - CURL ERROR: Error: $downloadError Status: 0");
        }

        curl_close($ch);
        fclose($fp);
        if (is_resource($putStream)) {
            fclose($putStream);
        }

        $this->setTmpFileName($tmpName); // Set a tmp filename

        return 'file';
    }

    $rawResponse = curl_exec($ch);
    $curlError = curl_error($ch);
    $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // Combined size of every header block received (redirect hops, "100 Continue",
    // proxy CONNECT). Splitting on it keeps the headers of the final response out of
    // the body, which splitting on the first blank line did not.
    // Assumes $curlOptions does not switch CURLOPT_HEADER off - TODO confirm.
    $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);

    curl_close($ch);
    if (is_resource($putStream)) {
        fclose($putStream);
    }

    // curl_exec() returns false on failure.
    $rawResponse = is_string($rawResponse) ? $rawResponse : '';
    $rawHeaders = str_replace("\r", '', (string) substr($rawResponse, 0, $headerSize));
    $responseBody = str_replace("\r", '', (string) substr($rawResponse, $headerSize));

    // Keep only the last header block: it belongs to the response whose body we hold.
    $headerBlocks = array_values(array_filter(
        explode("\n\n", trim($rawHeaders)),
        static function ($block) {
            return trim($block) !== '';
        }
    ));
    $finalBlock = $headerBlocks ? end($headerBlocks) : '';
    $responseHeaders = explode("\n", trim($finalBlock));
    // Shift off the HTTP status line
    array_shift($responseHeaders);

    if ($curlError !== '' || $statusCode == 0) {
        $this->utils->log(" - CURL ERROR: Error: $curlError Status: $statusCode");
        $statusCode = 500;
    }

    $response = HTTPResponse::create($responseBody, $statusCode);

    foreach ($responseHeaders as $headerLine) {
        if (strpos($headerLine, ':') !== false) {
            list($headerName, $headerVal) = explode(':', $headerLine, 2);
            $response->addHeader(trim($headerName), trim($headerVal));
        }
    }

    $this->utils->log(" - CURL END: {$this->url}. Status: $statusCode. ({$this->mime})");

    return $response;
}
||||
389 | |||||
/**
 * Remember the name of the temporary file a file/image response was written to.
 *
 * @param string $tmp The temporary file's name.
 * @return void
 */
public function setTmpFileName($tmp)
{
    $fileName = $tmp;
    $this->tmpFileName = $fileName;
}
||||
399 | |||||
/**
 * Return the temporary file name last stored with setTmpFileName().
 *
 * @return string An empty string until a name has been stored.
 */
public function getTmpFileName()
{
    $fileName = $this->tmpFileName;

    return $fileName;
}
||||
408 | |||||
/**
 * Ask the mime-processor whether this extractor's Mime-Type counts as HTML.
 *
 * @see {@link StaticSiteMimeProcessor}
 * @return boolean
 */
public function isMimeHTML()
{
    $processor = $this->mimeProcessor;

    return $processor->isOfHTML($this->mime);
}
||||
417 | |||||
/**
 * Ask the mime-processor whether this extractor's Mime-Type counts as a file.
 *
 * @see {@link StaticSiteMimeProcessor}
 * @return boolean
 */
public function isMimeFile()
{
    $processor = $this->mimeProcessor;

    return $processor->isOfFile($this->mime);
}
||||
426 | |||||
/**
 * Ask the mime-processor whether this extractor's Mime-Type counts as an image.
 *
 * @see {@link StaticSiteMimeProcessor}
 * @return boolean
 */
public function isMimeImage()
{
    $processor = $this->mimeProcessor;

    return $processor->isOfImage($this->mime);
}
||||
435 | |||||
/**
 * Ask the mime-processor whether this extractor's Mime-Type counts as a file or an image.
 *
 * @see {@link StaticSiteMimeProcessor}
 * @return boolean
 */
public function isMimeFileOrImage()
{
    $processor = $this->mimeProcessor;

    return $processor->isOfFileOrImage($this->mime);
}
||||
444 | |||||
/**
 * Pre-process the stored content so phpQuery can parse it.
 *
 * The content is trimmed (null is treated as an empty string). If the text '<html'
 * does not occur anywhere in it (case-insensitive), an opening '<html>' tag is
 * prepended and a warning is logged.
 *
 * @return void
 */
public function prepareContent(): void
{
    $markup = trim($this->content ?? '');
    $this->content = $markup;

    // Already carries an html tag somewhere: nothing more to do.
    if (stripos($markup, '<html') !== false) {
        return;
    }

    $this->content = '<html>' . $markup;
    $this->utils->log('Warning: content was missing opening "<html>" tag.');
}
||||
461 | } |
||||
462 |