StaticSiteContentExtractor::setTmpFileName()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
namespace PhpTek\Exodus\Tool;
4
5
use phpQuery;
6
use DOMDocument;
7
use DOMElement;
8
use PhpTek\Exodus\Tool\StaticSiteMimeProcessor;
9
use PhpTek\Exodus\Tool\StaticSiteUtils;
10
use SilverStripe\Core\Injector\Injectable;
11
use SilverStripe\Control\Director;
12
use SilverStripe\Control\HTTPResponse;
13
use SilverStripe\Core\Config\Configurable;
14
use SilverStripe\Core\Convert;
15
use SilverStripe\Core\TempFolder;
16
17
/**
18
 * This tool uses a combination of cURL and phpQuery to extract content from a URL.
19
 *
20
 * The URL is first downloaded using cURL, and then passed into phpQuery for processing.
21
 * Given a set of fieldnames and CSS selectors corresponding to them, a map of content
22
 * fields will be returned.
23
 *
24
 * If the URL represents a file-based Mime-Type, a Silverstripe `File` object is created and the
25
 * physical file it represents can then be post-processed and saved to the database and F/S.
26
 *
27
 * @package phptek/silverstripe-exodus
28
 * @author Sam Minee <[email protected]>
29
 * @author Russell Michell <[email protected]>
30
 */
31
class StaticSiteContentExtractor
32
{
33
    use Injectable;
34
    use Configurable;
35
36
    /**
37
     *
38
     * @var string
39
     */
40
    protected $url = null;
41
42
    /**
43
     *
44
     * @var string
45
     */
46
    protected $mime = null;
47
48
    /**
49
     * This is an HTML page's source markup.
50
     *
51
     * @var string
52
     */
53
    protected $content = null;
54
55
    /**
56
     *
57
     * @var \phpQueryObject|null
0 ignored issues
show
Bug introduced by
The type PhpTek\Exodus\Tool\phpQueryObject was not found. Did you mean phpQueryObject? If so, make sure to prefix the type with \.
Loading history...
58
     */
59
    protected $phpQuery = null;
60
61
    /**
62
     *
63
     * @var string
64
     */
65
    protected $tmpFileName = '';
66
67
    /**
68
     * "Caches" the mime-processor for use throughout
69
     *
70
     * @var StaticSiteMimeProcessor
71
     */
72
    protected $mimeProcessor;
73
74
    /**
75
     * Holds the StaticSiteUtils object on construct
76
     *
77
     * @var StaticSiteUtils
78
     */
79
    protected $utils;
80
81
    /**
     * Create a StaticSiteContentExtractor for a single URL.
     *
     * Stores the URL, Mime-Type and optional markup, grabs the shared mime-processor
     * and utility singletons, and logs the start of the extraction.
     *
     * @param string $url The absolute URL to extract content from
     * @param string $mime The Mime-Type
     * @param string $content (Optional. Useful only for crude tests that avoid rigging-up a URL to parse)
     * @throws \Exception Not thrown directly here; presumably raised by a collaborator - TODO confirm
     */
    public function __construct($url, $mime, $content = null)
    {
        // Shared collaborators, resolved in the same order as before
        $this->mimeProcessor = singleton(StaticSiteMimeProcessor::class);
        $this->utils = singleton(StaticSiteUtils::class);

        // Request state
        $this->url = $url;
        $this->mime = $mime;
        $this->content = $content;

        $message = sprintf('Begin extraction for URL: %s and Mime: %s', $this->url, $this->mime);
        $this->utils->log($message);
    }
100
101
    /**
     * Extract content for map of field => css-selector pairs
     *
     * Each field maps to one extraction rule or a list of them. A rule is either a
     * bare CSS selector string or an array that may carry the keys `selector`,
     * `attribute`, `outerhtml`, `excludeselectors` and `plaintext`. Rules are tried
     * in order and the first one that yields non-empty content wins for that field;
     * fields with no matching rule are omitted from the result.
     *
     * @param  array $selectorMap A map of field name => css-selector
     * @param  StaticSiteContentItem $item The item to extract; its `externalId` is used
     *                                     as the content for file and image Mime-Types
     * @return array Map of fieldname => the winning rule plus a 'content' key holding the field content
     */
    public function extractMapAndSelectors($selectorMap, $item): array
    {
        if (!$this->phpQuery) {
            $this->fetchContent();
        }

        $output = [];
        foreach ($selectorMap as $fieldName => $extractionRules) {
            if (!is_array($extractionRules)) {
                $extractionRules = [$extractionRules];
            }

            foreach ($extractionRules as $extractionRule) {
                $content = '';

                // Normalise a bare selector string into rule form
                if (!is_array($extractionRule)) {
                    $extractionRule = ['selector' => $extractionRule];
                }

                if ($this->isMimeHTML()) {
                    $cssSelector = $extractionRule['selector'] ?? '';
                    $attribute = $extractionRule['attribute'] ?? '';
                    $outerHTML = $extractionRule['outerhtml'] ?? false;
                    $content = $this->extractField($cssSelector, $attribute, $outerHTML);

                    // Further processing. 'excludeselectors' is optional: rules built
                    // from a bare selector string never carry it, so default it rather
                    // than reading an undefined key.
                    if ($content) {
                        $content = $this->excludeContent(
                            $extractionRule['excludeselectors'] ?? [],
                            $cssSelector,
                            $content
                        );
                    }
                } elseif ($this->isMimeFileOrImage()) {
                    $content = $item->externalId;
                }

                // Nothing extracted (or everything excluded): try the next rule
                if (!$content) {
                    continue;
                }

                if (!empty($extractionRule['plaintext'])) {
                    $content = Convert::html2raw($content);
                }

                // We found a match, select that one and ignore any other selectors
                $output[$fieldName] = $extractionRule;
                $output[$fieldName]['content'] = $content;
                break;
            }
        }

        return $output;
    }
162
163
    /**
     * Extract content for a single css selector
     *
     * With neither option set, the inner HTML of the match is returned. With
     * $outerHTML set, the full markup of every matched element is concatenated.
     * With only $attribute set, the non-blank values of that attribute are returned,
     * one per line. When both are set, $outerHTML takes precedence.
     *
     * @param  string $cssSelector The CSS selector for which to extract content.
     * @param  string $attribute   If set, the value will be from this HTML attribute.
     * @param  bool   $outerHTML   Should we return the full HTML markup of the whole field?
     * @return string The content for the passed $cssSelector, or '' when nothing is available.
     */
    public function extractField(string $cssSelector, string $attribute = '', bool $outerHTML = false): string
    {
        if (!$this->phpQuery) {
            // Sets $this->phpQuery - weird pattern
            $this->fetchContent();
        }

        // fetchContent() returns early for files & images without building a
        // phpQuery document, so there is nothing to select from.
        if (!$this->phpQuery) {
            return '';
        }

        $elements = $this->phpQuery[$cssSelector];

        // @todo temporary workaround for File objects
        if (!$elements) {
            return '';
        }

        // Just return the inner HTML when neither option was requested. The previous
        // "||" returned here whenever either option was missing, which made the
        // attribute branch below unreachable.
        if (!$outerHTML && !$attribute) {
            return trim((string) $elements->html());
        }

        $result = '';
        foreach ($elements as $element) {
            // Get the full html for this element
            if ($outerHTML) {
                $result .= $this->getOuterHTML($element);
                continue;
            }

            // Get the value of an attribute; compare against '' so that a literal
            // "0" value is kept rather than treated as empty.
            $value = (string) $element->getAttribute($attribute);
            if (trim($value) !== '') {
                $result .= $value . PHP_EOL;
            }
        }

        return trim($result);
    }
204
205
    /**
     * Strip away content from $content that matches one or many css selectors.
     *
     * Each exclude-selector is looked up beneath $parentSelector in the current
     * phpQuery document; the outer HTML of the match is then removed from $content
     * with a plain string replacement and the exclusion is logged.
     *
     * @param array|string|null $excludeSelectors Selectors to exclude; blank entries are skipped
     * @param string $parentSelector The selector the content was extracted with
     * @param string $content The extracted markup
     * @return string $content with the matched markup removed
     */
    protected function excludeContent($excludeSelectors, $parentSelector, $content)
    {
        // Explicit emptiness check instead of an implicit array-to-bool conversion
        if (empty($excludeSelectors)) {
            return $content;
        }

        // Tolerate a single selector passed as a string
        foreach ((array) $excludeSelectors as $excludeSelector) {
            if (!trim($excludeSelector)) {
                continue;
            }

            $fullSelector = $parentSelector . ' ' . $excludeSelector;
            $element = $this->phpQuery[$fullSelector];
            if (!$element) {
                continue;
            }

            // Skip when there is no markup to remove, so a null is never handed to
            // str_replace() and the log only reports real exclusions.
            $remove = (string) $element->htmlOuter();
            if ($remove === '') {
                continue;
            }

            $content = str_replace($remove, '', $content);
            $this->utils->log(' - Excluded content from "' . $fullSelector . '"');
        }

        return $content;
    }
232
233
    /**
     * Serialise an element, including the element itself and all of its
     * descendants, to an HTML string.
     *
     * The element is deep-copied into a fresh, otherwise empty document which is
     * then saved as HTML.
     *
     * @param DOMElement $element
     * @return string
     */
    protected function getOuterHTML(DOMElement $element)
    {
        $document = new DOMDocument();
        $document->substituteEntities = false;
        $document->preserveWhiteSpace = true;
        $document->formatOutput = false;

        // Deep import (true) so that child nodes come along with the element
        $copy = $document->importNode($element, true);
        $document->appendChild($copy);

        return $document->saveHTML();
    }
248
249
    /**
     * Accessor for the content held by this extractor: whatever was passed to the
     * constructor, or the response body stored by fetchContent().
     *
     * @return string
     */
    public function getContent()
    {
        $content = $this->content;

        return $content;
    }
257
258
    /**
     * Fetch the content, initialise $this->content and $this->phpQuery.
     * Initialise the latter only if an appropriate mime-type matches.
     *
     * For files and images curlRequest() writes the download to a temporary file and
     * returns the string 'file'; in that case neither property is touched.
     *
     * @return void
     * @todo deal-to defaults when $this->mime isn't matched.
     */
    protected function fetchContent()
    {
        $this->utils->log(" - Fetching {$this->url} ({$this->mime})");

        // Set some proxy options for phpCrawler. Uses the utils instance obtained in
        // the constructor rather than looking the singleton up again.
        $curlOpts = $this->utils->defineProxyOpts(!Director::isDev());
        $response = $this->curlRequest($this->url, "GET", null, null, $curlOpts);

        // Strict comparison: $response is otherwise a response object, which must
        // not be loosely compared with a string.
        if ($response === 'file') {
            // Just stop here for files & images
            return;
        }

        $this->content = $response->getBody();

        // Clean up the content so phpQuery doesn't bork
        $this->prepareContent();
        $this->phpQuery = phpQuery::newDocument($this->content);
    }
284
285
    /**
     * Use cURL to request a URL, and return a HTTPResponse object (`SiteTree`) or write curl output directly to a tmp file
     * ready for uploading to SilverStripe via Upload#load() (`File` and `Image`)
     *
     * For file and image Mime-Types the response is streamed into a temporary file,
     * the file's name is stored via setTmpFileName() and the string 'file' is returned.
     * For everything else the headers and body are parsed into a HTTPResponse; a cURL
     * error or a status code of 0 is reported as status 500.
     *
     * @todo Refactor using Guzzle
     * @param string $url
     * @param string $method
     * @param string|null $data Request body for POST and PUT requests
     * @param array|null $headers Passed to CURLOPT_HTTPHEADER when non-empty
     * @param array $curlOptions Extra cURL options, applied with curl_setopt_array()
     * @return string|HTTPResponse The string 'file', or the parsed response
     * @throws \RuntimeException If the temporary file for a download cannot be created or opened
     * @todo Add checks when fetching multi Mb images to ignore anything over 2Mb??
     */
    protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = [])
    {
        $this->utils->log(" - CURL START: {$this->url} ({$this->mime})");

        $ch = curl_init();
        $timeout = 10;
        // Stream holding the PUT payload, if any; closed once the request has run
        $put = null;

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->config()->get('user_agent'));
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 120);

        if ($headers) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        // Add fields to POST and PUT requests
        if ($method === 'POST') {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        } elseif ($method === 'PUT') {
            // $data defaults to null; fwrite() and strlen() need a string
            $payload = (string) $data;
            $put = fopen("php://temp", 'r+');
            fwrite($put, $payload);
            fseek($put, 0);

            curl_setopt($ch, CURLOPT_PUT, 1);
            curl_setopt($ch, CURLOPT_INFILE, $put);
            curl_setopt($ch, CURLOPT_INFILESIZE, strlen($payload));
        }

        // Follow redirects
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // Set any custom options passed to the request() function
        curl_setopt_array($ch, $curlOptions);

        // Run request
        // NOTE(review): peer verification is switched off unconditionally, and after
        // the caller's options have been applied, so callers cannot re-enable it.
        // Left as-is to avoid breaking existing crawls - confirm this is intended.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        // See: http://forums.devshed.com/php-development-5/curlopt-timeout-option-for-curl-calls-isn-t-being-obeyed-605642.html
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);     // No. seconds to wait while trying to connect.

        // Deal to files, write to them directly and then return
        if ($this->mimeProcessor->isOfFileOrImage($this->mime)) {
            // NOTE(review): the random sub-directory is not created anywhere in this
            // method, so tempnam() presumably falls back to the system temp
            // directory (hence the "@") - confirm intent.
            $tmp_name = @tempnam(TempFolder::getTempFolder('/') . '/' . rand(), 'tmp');
            $fp = $tmp_name === false ? false : fopen($tmp_name, 'w+');

            if ($fp === false) {
                // Release what we hold before bailing out
                curl_close($ch);
                if (is_resource($put)) {
                    fclose($put);
                }

                throw new \RuntimeException("Unable to create a temporary file for {$this->url}");
            }

            curl_setopt($ch, CURLOPT_HEADER, 0); // We do not want _any_ header info, it corrupts the file data
            curl_setopt($ch, CURLOPT_FILE, $fp); // write curl response directly to file, no messing about

            if (curl_exec($ch) === false) {
                $this->utils->log(' - CURL ERROR: Error: ' . curl_error($ch));
            }

            curl_close($ch);
            fclose($fp);
            if (is_resource($put)) {
                fclose($put);
            }

            $this->setTmpFileName($tmp_name); // Set a tmp filename

            return 'file';
        }

        $fullResponse = curl_exec($ch);
        $curlError = curl_error($ch);
        $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        // Combined size of every header block received (100-continue, redirects,
        // final response); the body starts right after it.
        $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        curl_close($ch);
        if (is_resource($put)) {
            fclose($put);
        }

        $rawHeaders = '';
        $responseBody = '';
        // curl_exec() returns false on failure; only slice a real response
        if (is_string($fullResponse)) {
            // Carriage returns are stripped from headers and body alike, as before
            $rawHeaders = str_replace("\r", "", substr($fullResponse, 0, $headerSize));
            $responseBody = str_replace("\r", "", substr($fullResponse, $headerSize));
        }

        // Only the last header block describes the response we are returning
        $headerBlocks = preg_split("/\n{2,}/", trim($rawHeaders)) ?: [''];
        $responseHeaders = explode("\n", (string) end($headerBlocks));
        // Shift off the HTTP response code
        array_shift($responseHeaders);

        if ($curlError !== '' || $statusCode == 0) {
            $this->utils->log(" - CURL ERROR: Error: $curlError Status: $statusCode");
            $statusCode = 500;
        }

        $response = HTTPResponse::create($responseBody, $statusCode);

        foreach ($responseHeaders as $headerLine) {
            if (strpos($headerLine, ":") !== false) {
                list($headerName, $headerVal) = explode(":", $headerLine, 2);
                $response->addHeader(trim($headerName), trim($headerVal));
            }
        }

        $this->utils->log(" - CURL END: {$this->url}. Status: $statusCode. ({$this->mime})");
        return $response;
    }
389
390
    /**
     * Record the name of the temporary file that holds a downloaded file or image.
     *
     * @param string $tmp
     * @return void
     */
    public function setTmpFileName($tmp)
    {
        $fileName = $tmp;

        $this->tmpFileName = $fileName;
    }
399
400
    /**
     * Accessor for the temporary file name; an empty string until
     * setTmpFileName() has been called.
     *
     * @return string
     */
    public function getTmpFileName()
    {
        $fileName = $this->tmpFileName;

        return $fileName;
    }
408
409
    /**
     * Ask the mime-processor whether this extractor's Mime-Type counts as HTML.
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeHTML()
    {
        $processor = $this->mimeProcessor;

        return $processor->isOfHTML($this->mime);
    }
417
418
    /**
     * Ask the mime-processor whether this extractor's Mime-Type counts as a file.
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeFile()
    {
        $processor = $this->mimeProcessor;

        return $processor->isOfFile($this->mime);
    }
426
427
    /**
     * Ask the mime-processor whether this extractor's Mime-Type counts as an image.
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeImage()
    {
        $processor = $this->mimeProcessor;

        return $processor->isOfImage($this->mime);
    }
435
436
    /**
     * Ask the mime-processor whether this extractor's Mime-Type counts as either a
     * file or an image.
     *
     * @see {@link StaticSiteMimeProcessor}
     * @return boolean
     */
    public function isMimeFileOrImage()
    {
        $processor = $this->mimeProcessor;

        return $processor->isOfFileOrImage($this->mime);
    }
444
445
    /**
     * Tidy $this->content before it is handed to phpQuery.
     *
     * The content is trimmed (null is treated as an empty string). If it contains no
     * '<html' anywhere (case-insensitive), an opening '<html>' tag is prepended and
     * a warning is logged.
     *
     * @return void
     */
    public function prepareContent(): void
    {
        $markup = trim($this->content ?? '');
        $this->content = $markup;

        // An '<html' tag is already present: nothing more to do
        if (stripos($markup, '<html') !== false) {
            return;
        }

        $this->content = '<html>' . $markup;
        $this->utils->log('Warning: content was missing opening "<html>" tag.');
    }
461
}
462