Completed
Pull Request — master (#4)
by Auke
01:53
created

Parser::removeEncoding()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 1 Features 0
Metric Value
c 1
b 1
f 0
dl 0
loc 5
ccs 0
cts 0
cp 0
rs 9.4285
cc 1
eloc 3
nc 1
nop 2
crap 2
1
<?php
2
3
namespace arc\html;
4
5
/**
 * Parses HTML strings (full documents or fragments) with DOMDocument and
 * wraps the result in an \arc\html\Proxy.
 */
class Parser
{
    /**
     * Parser settings. 'libxml_options' is handed to DOMDocument::loadHTML()
     * as its options argument.
     */
    public $options = [
        'libxml_options' => 0
    ];

    /**
     * @param array $options Settings to override. Only keys the parser knows
     *                       about ('libxml_options') are applied; all other
     *                       keys are ignored.
     */
    public function __construct( $options = array() )
    {
        $optionList = [ 'libxml_options' ];
        foreach ( $options as $option => $optionValue ) {
            // Strict comparison so that numeric keys can never match a name.
            if ( in_array( $option, $optionList, true ) ) {
                // Store in $this->options, which is what parseFull() reads;
                // assigning to $this->{$option} left the setting unused.
                $this->options[$option] = $optionValue;
            }
        }
    }

    /**
     * Parses HTML into a Proxy.
     *
     * @param mixed       $html     HTML source, anything castable to string,
     *                              or an already parsed Proxy.
     * @param string|null $encoding Optional character encoding; when given, a
     *                              temporary <meta charset> is injected while
     *                              parsing and removed again afterwards.
     * @return \arc\html\Proxy
     * @throws \arc\Exception when the HTML cannot be loaded.
     */
    public function parse( $html, $encoding = null )
    {
        if ( !$html ) {
            // Construct the Proxy the same way the other call sites in this
            // class do (with `new` and the parser as second argument).
            // NOTE(review): assumes Proxy accepts a null payload - confirm.
            return new \arc\html\Proxy( null, $this );
        }
        if ( $html instanceof \arc\html\Proxy ) { // already parsed
            return $html;
        }
        $html = (string) $html;
        // Match the opening tag with or without attributes (<html lang="en">).
        if ( preg_match( '/<html[\s>]/i', $html ) ) {
            return $this->parseFull( $html, $encoding );
        }
        return $this->parsePartial( $html, $encoding );
    }

    /**
     * Parses an HTML fragment by wrapping it in a temporary container div and
     * returning a Proxy around that container's children.
     *
     * @param string      $html
     * @param string|null $encoding
     * @return \arc\html\Proxy
     * @throws \arc\Exception when parsing yields no result.
     */
    private function parsePartial( $html, $encoding )
    {
        $result = $this->parseFull( '<div id="ArcPartialHTML">'.$html.'</div>', $encoding );
        if ( !$result ) {
            throw new \arc\Exception('parse error');
        }
        return new \arc\html\Proxy( $result->find('#ArcPartialHTML')[0]->children(), $this );
    }

    /**
     * Collects the pending libxml errors, restores the previous libxml error
     * handling mode and throws an exception listing every error.
     *
     * @param bool $prevErrorSetting Value previously returned by
     *                               libxml_use_internal_errors().
     * @throws \arc\Exception always.
     */
    private function throwError($prevErrorSetting)
    {
        $errors = libxml_get_errors();
        libxml_clear_errors();
        libxml_use_internal_errors( $prevErrorSetting );
        $message = 'Incorrect html passed.';
        foreach ( $errors as $error ) {
            $message .= "\nline: ".$error->line."; column: ".$error->column."; ".$error->message;
        }
        throw new \arc\Exception( $message, \arc\exceptions::ILLEGAL_ARGUMENT );
    }

    /**
     * Injects a temporary <meta charset> element (id "ArcTempEncoding") into
     * the markup: inside the first <head> if there is one, otherwise in a new
     * <head> placed before the first <body>, otherwise in front of the markup.
     *
     * @param string $html
     * @param string $encoding
     * @return string The markup including the temporary meta element.
     */
    private function insertEncoding($html, $encoding)
    {
        $meta = '<meta id="ArcTempEncoding" charset="'.htmlspecialchars($encoding).'">';
        // \b keeps <header> / <bodyfoo> from being mistaken for <head> / <body>.
        $headPattern = '/<head\b([^>]*)>/i';
        $bodyPattern = '/<body\b([^>]*)>/i';
        // Callbacks are used so that "$1" or "\1" inside $encoding is never
        // interpreted as a backreference; the limit of 1 touches only the
        // first matching tag.
        if ( preg_match( $headPattern, $html ) ) {
            return preg_replace_callback( $headPattern, function( $match ) use ( $meta ) {
                return $match[0].$meta;
            }, $html, 1 );
        }
        if ( preg_match( $bodyPattern, $html ) ) {
            return preg_replace_callback( $bodyPattern, function( $match ) use ( $meta ) {
                return '<head>'.$meta.'</head>'.$match[0];
            }, $html, 1 );
        }
        return $meta.$html;
    }

    /**
     * Removes the temporary meta element added by insertEncoding() from the
     * parsed document, if it is present.
     *
     * @param \DOMDocument $dom
     * @param string       $encoding Unused; kept so existing calls that pass
     *                               it keep working.
     */
    private function removeEncoding( $dom, $encoding )
    {
        $meta = $dom->getElementById('ArcTempEncoding');
        // getElementById() returns null when the element is not found.
        if ( $meta && $meta->parentNode ) {
            $meta->parentNode->removeChild($meta);
        }
    }

    /**
     * Loads the markup into a DOMDocument and wraps it in a Proxy.
     *
     * @param string      $html
     * @param string|null $encoding
     * @return \arc\html\Proxy
     * @throws \arc\Exception when DOMDocument::loadHTML() fails.
     */
    private function parseFull( $html, $encoding )
    {
        $dom = new \DomDocument();
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
        // external entity loading is already off by default; only call it on
        // older versions.
        if ( \PHP_VERSION_ID < 80000 ) {
            libxml_disable_entity_loader(); // prevents XXE attacks
        }
        $prevErrorSetting = libxml_use_internal_errors(true);
        if ( $encoding ) {
            $html = $this->insertEncoding($html, $encoding);
        }
        if ( !$dom->loadHTML( $html, $this->options['libxml_options'] ) ) {
            $this->throwError($prevErrorSetting);
        }
        // Drop the warnings collected during a successful load so the libxml
        // error buffer does not keep growing, then restore the caller's error
        // mode before any further processing.
        libxml_clear_errors();
        libxml_use_internal_errors( $prevErrorSetting );
        if ( $encoding ) {
            $this->removeEncoding($dom, $encoding);
        }
        return new \arc\html\Proxy( simplexml_import_dom( $dom ), $this );
    }

}