1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PhpTek\Exodus\Tool; |
4
|
|
|
|
5
|
|
|
use phpQuery; |
6
|
|
|
use DOMDocument; |
7
|
|
|
use DOMElement; |
8
|
|
|
use PhpTek\Exodus\Tool\StaticSiteMimeProcessor; |
9
|
|
|
use PhpTek\Exodus\Tool\StaticSiteUtils; |
10
|
|
|
use SilverStripe\Core\Injector\Injectable; |
11
|
|
|
use SilverStripe\Control\Director; |
12
|
|
|
use SilverStripe\Control\HTTPResponse; |
13
|
|
|
use SilverStripe\Core\Config\Configurable; |
14
|
|
|
use SilverStripe\Core\Convert; |
15
|
|
|
use SilverStripe\Core\TempFolder; |
16
|
|
|
|
/**
 * This tool uses a combination of cURL and phpQuery to extract content from a URL.
 *
 * The URL is first downloaded using cURL, and then passed into phpQuery for processing.
 * Given a set of fieldnames and CSS selectors corresponding to them, a map of content
 * fields will be returned.
 *
 * If the URL represents a file-based Mime-Type, the response is streamed straight into a
 * temporary file (see {@link getTmpFileName()}) so that it can then be post-processed and
 * saved to the database and F/S as a Silverstripe `File`.
 *
 * @package phptek/silverstripe-exodus
 * @author Sam Minee <[email protected]>
 * @author Russell Michell <[email protected]>
 */
class StaticSiteContentExtractor
{
    use Injectable;
    use Configurable;

    /**
     * The absolute URL content is extracted from.
     *
     * @var string
     */
    protected $url = null;

    /**
     * The Mime-Type that decides whether the URL is treated as HTML or as a file/image.
     *
     * @var string
     */
    protected $mime = null;

    /**
     * This is an HTML page's source markup.
     *
     * @var string
     */
    protected $content = null;

    /**
     * The parsed document. Stays null until fetchContent() has run for an HTML URL.
     *
     * @var phpQueryObject
     */
    protected $phpQuery = null;

    /**
     * Path of the temporary file a file/image response was written to.
     *
     * @var string
     */
    protected $tmpFileName = '';

    /**
     * "Caches" the mime-processor for use throughout
     *
     * @var StaticSiteMimeProcessor
     */
    protected $mimeProcessor;

    /**
     * Holds the StaticSiteUtils object on construct
     *
     * @var StaticSiteUtils
     */
    protected $utils;

    /**
     * Create a StaticSiteContentExtractor for a single URL.
     *
     * @param string $url The absolute URL to extract content from
     * @param string $mime The Mime-Type
     * @param string $content (Optional. Useful only for crude tests that avoid rigging-up a URL to parse)
     * @throws \Exception Declared by the original author; nothing in this constructor throws directly.
     */
    public function __construct($url, $mime, $content = null)
    {
        $this->url = $url;
        $this->mime = $mime;
        $this->content = $content;
        $this->mimeProcessor = singleton(StaticSiteMimeProcessor::class);
        $this->utils = singleton(StaticSiteUtils::class);

        $this->utils->log(sprintf('Begin extraction for URL: %s and Mime: %s', $this->url, $this->mime));
    }

    /**
     * Extract content for map of field => css-selector pairs
     *
     * Each field may carry one rule or a list of rules; a rule is either a plain selector
     * string or an array with the keys 'selector', 'attribute', 'outerhtml',
     * 'excludeselectors' and 'plaintext' (all but 'selector' optional). The first rule
     * that yields content wins for its field.
     *
     * @param array $selectorMap A map of field name => css-selector
     * @param StaticSiteContentItem $item The item to extract
     * @return array Map of fieldname => the winning rule array with an added 'content' key
     */
    public function extractMapAndSelectors($selectorMap, $item): array
    {
        if (!$this->phpQuery) {
            $this->fetchContent();
        }

        $output = [];

        foreach ($selectorMap as $fieldName => $extractionRules) {
            if (!is_array($extractionRules)) {
                $extractionRules = [$extractionRules];
            }

            foreach ($extractionRules as $extractionRule) {
                $content = '';

                // Normalise the shorthand "just a selector string" form
                if (!is_array($extractionRule)) {
                    $extractionRule = ['selector' => $extractionRule];
                }

                $cssSelector = $extractionRule['selector'] ?? '';

                if ($this->isMimeHTML()) {
                    $attribute = $extractionRule['attribute'] ?? '';
                    $outerHTML = $extractionRule['outerhtml'] ?? false;
                    $content = $this->extractField($cssSelector, $attribute, $outerHTML);
                } elseif ($this->isMimeFileOrImage()) {
                    $content = $item->externalId;
                }

                if (!$content) {
                    continue;
                }

                // Further processing
                if ($this->isMimeHTML()) {
                    // 'excludeselectors' is optional: shorthand rules never define it
                    $content = $this->excludeContent(
                        $extractionRule['excludeselectors'] ?? [],
                        $cssSelector,
                        $content
                    );
                }

                if (!$content) {
                    continue;
                }

                if (!empty($extractionRule['plaintext'])) {
                    $content = Convert::html2raw($content);
                }

                // We found a match, select that one and ignore any other selectors
                $output[$fieldName] = $extractionRule;
                $output[$fieldName]['content'] = $content;
                break;
            }
        }

        return $output;
    }

    /**
     * Extract content for a single css selector
     *
     * Precedence: $outerHTML (full markup of every match) beats $attribute (the attribute's
     * value from every match, one per line); with neither set the inner HTML is returned.
     *
     * @param string $cssSelector The CSS selector for which to extract content.
     * @param string $attribute If set, the value will be from this HTML attribute.
     * @param bool $outerHTML Should we return the full HTML markup of the whole field?
     * @return string The content for the passed $cssSelector.
     */
    public function extractField(string $cssSelector, string $attribute = '', bool $outerHTML = false): string
    {
        if (!$this->phpQuery) {
            // fetchContent() populates $this->phpQuery as a side effect
            $this->fetchContent();
        }

        // fetchContent() returns early for files & images, leaving no document to query
        if (!$this->phpQuery) {
            return '';
        }

        $elements = $this->phpQuery[$cssSelector];

        // @todo temporary workaround for File objects
        if (!$elements) {
            return '';
        }

        // Just return the inner HTML when neither special mode was requested.
        // (This used to be "||", which made the attribute branch below unreachable.)
        if (!$outerHTML && !$attribute) {
            return trim((string) $elements->html());
        }

        $result = '';
        foreach ($elements as $element) {
            // Get the full html for this element
            if ($outerHTML) {
                $result .= $this->getOuterHTML($element);
                continue;
            }

            // Get the value of an attribute, skipping elements where it is blank
            $value = (string) $element->getAttribute($attribute);
            if (trim($value) !== '') {
                $result .= $value . PHP_EOL;
            }
        }

        return trim($result);
    }

    /**
     * Strip away content from $content that matches one or many css selectors.
     *
     * Each exclude-selector is resolved relative to $parentSelector against the parsed
     * document and the outer HTML of the match is removed from $content verbatim.
     *
     * @param array $excludeSelectors
     * @param string $parentSelector
     * @param string $content
     * @return string
     */
    protected function excludeContent($excludeSelectors, $parentSelector, $content)
    {
        if (!$excludeSelectors || !$this->phpQuery) {
            return $content;
        }

        foreach ((array) $excludeSelectors as $excludeSelector) {
            if (!is_string($excludeSelector) || !trim($excludeSelector)) {
                continue;
            }

            $scopedSelector = $parentSelector . ' ' . $excludeSelector;
            $element = $this->phpQuery[$scopedSelector];
            if (!$element) {
                continue;
            }

            // Nothing matched: do not claim in the log that something was excluded
            $remove = (string) $element->htmlOuter();
            if ($remove === '') {
                continue;
            }

            $content = str_replace($remove, '', $content);
            $this->utils->log(' - Excluded content from "' . $scopedSelector . '"');
        }

        return $content;
    }

    /**
     * Get the full HTML of the element and its children
     *
     * The element is deep-copied into a scratch document so that only its own markup
     * is serialised.
     *
     * @param DOMElement $element
     * @return string
     */
    protected function getOuterHTML(DOMElement $element)
    {
        $doc = new DOMDocument();
        $doc->formatOutput = false;
        $doc->preserveWhiteSpace = true;
        $doc->substituteEntities = false;
        $doc->appendChild($doc->importNode($element, true));

        return $doc->saveHTML();
    }

    /**
     * The current page markup: whatever was passed to the constructor, or the fetched
     * and prepared response body once fetchContent() has run.
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Fetch the content, initialise $this->content and $this->phpQuery.
     * Initialise the latter only if an appropriate mime-type matches.
     *
     * @return void
     * @todo deal-to defaults when $this->mime isn't matched.
     */
    protected function fetchContent()
    {
        $this->utils->log(" - Fetching {$this->url} ({$this->mime})");

        // Set some proxy options for phpCrawler
        $curlOpts = $this->utils->defineProxyOpts(!Director::isDev());
        $response = $this->curlRequest($this->url, 'GET', null, null, $curlOpts);

        // Just stop here for files & images ('file'), or anything else that has no body to parse
        if ($response === 'file' || !$response instanceof HTTPResponse) {
            return;
        }

        $this->content = $response->getBody();

        // Clean up the content so phpQuery doesn't bork
        $this->prepareContent();
        $this->phpQuery = phpQuery::newDocument($this->content);
    }

    /**
     * Use cURL to request a URL, and return a HTTPResponse object (`SiteTree`) or write curl output directly to a tmp file
     * ready for uploading to SilverStripe via Upload#load() (`File` and `Image`)
     *
     * @todo Refactor using Guzzle
     * @param string $url
     * @param string $method
     * @param string $data Request payload, only used for POST and PUT
     * @param array $headers Passed to CURLOPT_HTTPHEADER when non-empty
     * @param array $curlOptions Extra cURL options, applied after the defaults
     * @return string|HTTPResponse The literal string 'file' for files & images, otherwise the response
     * @throws \RuntimeException If no temporary file can be created for a file/image download
     * @todo Add checks when fetching multi Mb images to ignore anything over 2Mb??
     */
    protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = [])
    {
        $this->utils->log(" - CURL START: {$this->url} ({$this->mime})");

        $ch = curl_init();
        $connectTimeout = 10;
        $putStream = null;

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->config()->get('user_agent'));
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 120);

        if ($headers) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        // Add fields to POST and PUT requests
        if ($method == 'POST') {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        } elseif ($method == 'PUT') {
            // $data defaults to null; fwrite()/strlen() need a string
            $payload = (string) $data;
            $putStream = fopen('php://temp', 'r+');
            fwrite($putStream, $payload);
            fseek($putStream, 0);

            curl_setopt($ch, CURLOPT_PUT, 1);
            curl_setopt($ch, CURLOPT_INFILE, $putStream);
            curl_setopt($ch, CURLOPT_INFILESIZE, strlen($payload));
        }

        // Follow redirects
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // Set any custom options passed to the request() function
        if (is_array($curlOptions) && $curlOptions) {
            curl_setopt_array($ch, $curlOptions);
        }

        // NOTE(review): peer verification is switched off unconditionally, and after $curlOptions
        // has been applied, so callers cannot re-enable it. Kept as-is for backward compatibility
        // with sites using self-signed certificates - consider making this configurable.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        // See: http://forums.devshed.com/php-development-5/curlopt-timeout-option-for-curl-calls-isn-t-being-obeyed-605642.html
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout); // No. seconds to wait while trying to connect.

        // Deal to files, write to them directly and then return
        if ($this->mimeProcessor->isOfFileOrImage($this->mime)) {
            // NOTE(review): the rand() sub-directory is never created, so tempnam() presumably falls
            // back to the system temp directory (hence the "@") - confirm this is intended.
            $tmpName = @tempnam(TempFolder::getTempFolder('/') . '/' . rand(), 'tmp');
            $fp = $tmpName ? fopen($tmpName, 'w+') : false;

            if (!$fp) {
                // Release what we hold before bailing out; cURL cannot be handed a non-stream
                curl_close($ch);
                if (is_resource($putStream)) {
                    fclose($putStream);
                }

                throw new \RuntimeException(sprintf('Unable to create a temporary file for URL: %s', $url));
            }

            curl_setopt($ch, CURLOPT_HEADER, 0); // We do not want _any_ header info, it corrupts the file data
            curl_setopt($ch, CURLOPT_FILE, $fp); // write curl response directly to file, no messing about

            if (curl_exec($ch) === false) {
                $this->utils->log(' - CURL ERROR: Error: ' . curl_error($ch) . ' (file download)');
            }

            curl_close($ch);
            fclose($fp);
            if (is_resource($putStream)) {
                fclose($putStream);
            }

            $this->setTmpFileName($tmpName); // Set a tmp filename

            return 'file';
        }

        $fullResponseBody = curl_exec($ch);
        $curlError = curl_error($ch);
        $statusCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

        curl_close($ch);
        if (is_resource($putStream)) {
            fclose($putStream);
        }

        // curl_exec() yields false on failure: treat that as an empty response
        $normalised = is_string($fullResponseBody) ? str_replace("\r", '', $fullResponseBody) : '';
        list($responseHeaders, $responseBody) = array_pad(explode("\n\n", $normalised, 2), 2, '');

        // Because CURLOPT_HEADER is combined with CURLOPT_FOLLOWLOCATION, the output carries one
        // header block per hop. Skip interim (1xx), redirect (3xx) and proxy "Connection established"
        // blocks for as long as another status line follows, so the body never starts with headers.
        $skippableBlock = '#^HTTP/[\d.]+\s+(?:1\d\d|3\d\d|200\s+Connection established)#i';
        while (
            preg_match($skippableBlock, $responseHeaders)
            && preg_match('#^HTTP/[\d.]+\s+\d{3}#', $responseBody)
        ) {
            list($responseHeaders, $responseBody) = array_pad(explode("\n\n", $responseBody, 2), 2, '');
        }

        $responseHeaders = explode("\n", trim($responseHeaders));
        // Shift off the HTTP response code
        array_shift($responseHeaders);

        if ($curlError !== '' || $statusCode === 0) {
            $this->utils->log(" - CURL ERROR: Error: $curlError Status: $statusCode");
            // Report transport-level failures as a server error
            $statusCode = 500;
        }

        $response = HTTPResponse::create($responseBody, $statusCode);

        foreach ($responseHeaders as $headerLine) {
            if (strpos($headerLine, ':') !== false) {
                list($headerName, $headerVal) = explode(':', $headerLine, 2);
                $response->addHeader(trim($headerName), trim($headerVal));
            }
        }

        $this->utils->log(" - CURL END: {$this->url}. Status: $statusCode. ({$this->mime})");

        return $response;
    }

    /**
     * Record the path of the temporary file a download was written to.
     *
     * @param string $tmp
     * @return void
     */
    public function setTmpFileName($tmp)
    {
        $this->tmpFileName = $tmp;
    }

    /**
     * Path of the temporary file written by the last file/image download, '' if none.
     *
     * @return string
     */
    public function getTmpFileName()
    {
        return $this->tmpFileName;
    }

    /**
     * Does the mime-processor class this extractor's Mime-Type as HTML?
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeHTML()
    {
        return $this->mimeProcessor->isOfHTML($this->mime);
    }

    /**
     * Does the mime-processor class this extractor's Mime-Type as a file?
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeFile()
    {
        return $this->mimeProcessor->isOfFile($this->mime);
    }

    /**
     * Does the mime-processor class this extractor's Mime-Type as an image?
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeImage()
    {
        return $this->mimeProcessor->isOfImage($this->mime);
    }

    /**
     * Does the mime-processor class this extractor's Mime-Type as a file or an image?
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeFileOrImage()
    {
        return $this->mimeProcessor->isOfFileOrImage($this->mime);
    }

    /**
     * Pre-process the content so phpQuery can parse it without violently barfing.
     *
     * Trims the markup and, when no "<html" tag occurs anywhere in it, prepends one
     * and logs a warning.
     *
     * @return void
     */
    public function prepareContent(): void
    {
        // Trim it
        $this->content = trim($this->content ?? '');

        // Ensure the content contains an opening 'html' tag
        if (stripos($this->content, '<html') === false) {
            $this->content = '<html>' . $this->content;
            $this->utils->log('Warning: content was missing opening "<html>" tag.');
        }
    }
}
462
|
|
|
|