Completed
Push — v2 ( dad8a3...5e546d )
by Joschi
06:58
created

DocumentFactory::isAcceptableError()   B

Complexity

Conditions 5
Paths 3

Size

Total Lines 17
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 5

Importance

Changes 0
Metric Value
cc 5
dl 0
loc 17
rs 8.8571
c 0
b 0
f 0
eloc 8
nc 3
nop 1
ccs 8
cts 8
cp 1
crap 5
1
<?php
2
3
/**
4
 * micrometa
5
 *
6
 * @category Jkphl
7
 * @package Jkphl\Micrometa
8
 * @subpackage Jkphl\Micrometa\Infrastructure\Factory
9
 * @author Joschi Kuphal <[email protected]> / @jkphl
10
 * @copyright Copyright © 2017 Joschi Kuphal <[email protected]> / @jkphl
11
 * @license http://opensource.org/licenses/MIT The MIT License (MIT)
12
 */
13
14
/***********************************************************************************
15
 *  The MIT License (MIT)
16
 *
17
 *  Copyright © 2017 Joschi Kuphal <[email protected]> / @jkphl
18
 *
19
 *  Permission is hereby granted, free of charge, to any person obtaining a copy of
20
 *  this software and associated documentation files (the "Software"), to deal in
21
 *  the Software without restriction, including without limitation the rights to
22
 *  use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
23
 *  the Software, and to permit persons to whom the Software is furnished to do so,
24
 *  subject to the following conditions:
25
 *
26
 *  The above copyright notice and this permission notice shall be included in all
27
 *  copies or substantial portions of the Software.
28
 *
29
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
31
 *  FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
32
 *  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
33
 *  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
34
 *  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35
 ***********************************************************************************/
36
37
namespace Jkphl\Micrometa\Infrastructure\Factory;
38
39
use Guzzle\Common\Exception\InvalidArgumentException as GuzzleInvalidArgumentException;
40
use Guzzle\Common\Exception\RuntimeException as GuzzleRuntimeException;
41
use Guzzle\Http\Client;
42
use Guzzle\Http\Url;
43
use Jkphl\Micrometa\Ports\Exceptions\InvalidArgumentException;
44
use Jkphl\Micrometa\Ports\Exceptions\RuntimeException;
45
46
/**
47
 * DOM document factory
48
 *
49
 * @package Jkphl\Micrometa
50
 * @subpackage Jkphl\Micrometa\Infrastructure
51
 */
52
class DocumentFactory
{
    /**
     * Known HTML element names
     *
     * Used by isAcceptableError() to tolerate "invalid tag" parser errors
     * for elements the underlying HTML parser does not know about.
     *
     * @var array
     */
    protected static $html5 = [
        'a',
        'abbr',
        'acronym',
        'address',
        'applet',
        'area',
        'article',
        'aside',
        'audio',
        'b',
        'base',
        'basefont',
        'bdi',
        'bdo',
        'bgsound',
        'big',
        'blink',
        'blockquote',
        'body',
        'br',
        'button',
        'canvas',
        'caption',
        'center',
        'cite',
        'code',
        'col',
        'colgroup',
        'content',
        'data',
        'datalist',
        'dd',
        'decorator',
        'del',
        'details',
        'dfn',
        'dir',
        'div',
        'dl',
        'dt',
        'element',
        'em',
        'embed',
        'fieldset',
        'figcaption',
        'figure',
        'font',
        'footer',
        'form',
        'frame',
        'frameset',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'head',
        'header',
        'hgroup',
        'hr',
        'html',
        'i',
        'iframe',
        'img',
        'input',
        'ins',
        'isindex',
        'kbd',
        'keygen',
        'label',
        'legend',
        'li',
        'link',
        'listing',
        'main',
        'map',
        'mark',
        'marquee',
        'menu',
        'menuitem',
        'meta',
        'meter',
        'nav',
        'nobr',
        'noframes',
        'noscript',
        'object',
        'ol',
        'optgroup',
        'option',
        'output',
        'p',
        'param',
        'plaintext',
        'pre',
        'progress',
        'q',
        'rp',
        'rt',
        'ruby',
        's',
        'samp',
        'script',
        'section',
        'select',
        'shadow',
        'small',
        'source',
        'spacer',
        'span',
        'strike',
        'strong',
        'style',
        'sub',
        'summary',
        'sup',
        'table',
        'tbody',
        'td',
        'template',
        'textarea',
        'tfoot',
        'th',
        'thead',
        'time',
        'title',
        'tr',
        'track',
        'tt',
        'u',
        'ul',
        'var',
        'video',
        'wbr',
        'xmp',
    ];

    /**
     * HTML void elements
     *
     * NOTE(review): not referenced by any method of this class; presumably
     * consumed by subclasses or kept for future use -- verify before removing.
     *
     * @var array
     */
    protected static $htmlVoid = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'menuitem',
        'meta',
        'param',
        'source',
        'track',
        'wbr',
        'basefont',
        'bgsound',
        'frame',
        'isindex',
    ];

    /**
     * Create a DOM document from a URI
     *
     * Uses the Guzzle HTTP client when the cURL extension is loaded and the
     * PHP stream wrapper otherwise.
     *
     * @param string $url HTTP / HTTPS URL
     * @return \DOMDocument DOM document
     */
    public static function createFromUri($url)
    {
        return extension_loaded('curl') ? self::createViaHttpClient($url) : self::createViaStreamWrapper($url);
    }

    /**
     * Create a DOM document using a HTTP client implementation
     *
     * @param string $url HTTP / HTTPS URL
     * @return \DOMDocument DOM document
     * @throws InvalidArgumentException If an argument was invalid
     * @throws RuntimeException If the request wasn't successful or a runtime exception occurred
     */
    protected static function createViaHttpClient($url)
    {
        try {
            $guzzleUrl = Url::factory($url);
            $client = new Client($guzzleUrl, ['timeout' => 10.0]);
            $request = $client->get($guzzleUrl);
            $response = $client->send($request);
            return self::createFromString(strval($response->getBody()));

            // If an argument was invalid (keep the original exception for debugging)
        } catch (GuzzleInvalidArgumentException $e) {
            throw new InvalidArgumentException($e->getMessage(), $e->getCode(), $e);

            // If a runtime exception occurred (keep the original exception for debugging)
        } catch (GuzzleRuntimeException $e) {
            throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
        }
    }

    /**
     * Create a DOM document from a string
     *
     * The source is first parsed as XML; if that fails it is parsed as HTML
     * and every reported parser error is checked with isAcceptableError().
     * An empty source yields an empty document.
     *
     * @param string $str String
     * @return \DOMDocument DOM document
     * @throws InvalidArgumentException If the HTML parser reports an unacceptable error
     */
    public static function createFromString($str)
    {
        $str = (string)$str;
        $dom = new \DOMDocument();

        // Nothing to parse: handing an empty string to the libxml loaders only
        // raises a warning / error, so return the empty document right away
        if ($str === '') {
            return $dom;
        }

        // Encode non-ASCII characters as entities. Skip the conversion when the
        // encoding cannot be detected instead of passing `false` as source encoding.
        // NOTE(review): 'HTML-ENTITIES' is deprecated as mbstring target since
        // PHP 8.2; replacing it changes which documents parse as XML, so it is
        // kept here -- revisit when raising the minimum PHP version.
        $encoding = mb_detect_encoding($str);
        $source = ($encoding === false) ? $str : mb_convert_encoding($str, 'HTML-ENTITIES', $encoding);

        // Try to load the source as XML document first, then as HTML document
        if (!$dom->loadXML($source, LIBXML_NOWARNING | LIBXML_NOERROR)) {
            // Collect parser errors internally, remembering the caller's setting
            $previousState = libxml_use_internal_errors(true);

            // Drop stale errors so that only errors of this parsing run are evaluated
            libxml_clear_errors();
            $dom->loadHTML($source, LIBXML_NOWARNING);
            $errors = libxml_get_errors();

            // Clean up and restore the caller's error handling mode
            libxml_clear_errors();
            libxml_use_internal_errors($previousState);

            // Run through all errors
            /** @var \LibXMLError $error */
            foreach ($errors as $error) {
                if (!self::isAcceptableError($error)) {
                    throw new InvalidArgumentException(
                        sprintf(InvalidArgumentException::INVALID_DATA_SOURCE_STR, trim($error->message)),
                        InvalidArgumentException::INVALID_DATA_SOURCE
                    );
                }
            }
        }

        return $dom;
    }

    /**
     * Test whether a parsing error is acceptable
     *
     * Acceptable are "invalid tag" errors (code 801) naming one of the known
     * elements in self::$html5 as well as errors with the codes 76 and 77.
     *
     * @param \LibXMLError $error Parsing error
     * @return bool Acceptable error
     */
    protected static function isAcceptableError(\LibXMLError $error)
    {
        // If it's an error based on an HTML5 element
        if (($error->code === 801) &&
            preg_match('/^tag\s+(\S+)\s+invalid$/', strtolower($error->message), $tag) &&
            in_array($tag[1], self::$html5, true)
        ) {
            return true;
        }

        // If it's an error based on a non closing element
        return in_array($error->code, [76, 77], true);
    }

    /**
     * Create a DOM document via the PHP stream wrapper
     *
     * @param string $url URL
     * @return \DOMDocument DOM document
     * @throws RuntimeException If the URL could not be fetched
     */
    protected static function createViaStreamWrapper($url)
    {
        $opts = [
            'http' => [
                'method' => 'GET',
                'protocol_version' => 1.1,
                'user_agent' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.466.4 Safari/534.3',
                'max_redirects' => 10,
                'timeout' => 120,
                'header' => "Accept-language: en\r\n",
            ],
        ];
        $context = stream_context_create($opts);

        // The warning is suppressed on purpose; the failure is reported as exception below
        $response = @file_get_contents($url, false, $context);
        if ($response === false) {
            throw new RuntimeException(sprintf('Could not fetch "%s"', $url));
        }

        return self::createFromString($response);
    }
}
345