| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                <?php  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                namespace arc\html;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                class Parser  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                    public $options = [  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                        'libxml_options' => 0  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                    ];  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    11
                 | 
                                    
                             5                          | 
                
                 | 
                    public function __construct( $options = array() )  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    13
                 | 
                                    
                             5                          | 
                
                 | 
                        $optionList = [ 'libxml_options' ];  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    14
                 | 
                                    
                             5                          | 
                
                 | 
                        foreach( $options as $option => $optionValue ) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                            if ( in_array( $option, $optionList ) ) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                                $this->{$option} = $optionValue; | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                            }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    18
                 | 
                                    
                             5                          | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    19
                 | 
                                    
                             5                          | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    21
                 | 
                                    
                             5                          | 
                
                 | 
                    public function parse( $html, $encoding = 'UTF-8' )  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    23
                 | 
                                    
                             5                          | 
                
                 | 
                        if ( !$html ) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                            return \arc\html\Proxy( null );  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    26
                 | 
                                    
                             5                          | 
                
                 | 
                        if ( $html instanceof Proxy ) { // already parsed | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                            return $html;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    29
                 | 
                                    
                             5                          | 
                
                 | 
                        $html = (string) $html;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    30
                 | 
                                    
                             5                          | 
                
                 | 
                        if ( stripos($html, '<html>')!==false ) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    31
                 | 
                                    
                             5                          | 
                
                 | 
                            return $this->parseFull( $html, $encoding );  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                        } else { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                            return $this->parsePartial( $html, $encoding );  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                    private function parsePartial( $html, $encoding = 'UTF-8')  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                        $result = $this->parseFull( '<div id="ArcPartialHTML">'.$html.'</div>', $encoding );  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                        if ( $result ) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                            $result = new \arc\html\Proxy( $result->find('#ArcPartialHTML')[0]->children(), $this ); | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                        } else { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                            throw new \arc\Exception('parse error'); | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                        return $result;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    48
                 | 
                                    
                             5                          | 
                
                 | 
                    private function parseFull( $html, $encoding = 'UTF-8')  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    50
                 | 
                                    
                             5                          | 
                
                 | 
                        $dom = new \DomDocument('1.0', $encoding); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    51
                 | 
                                    
                             5                          | 
                
                 | 
                        $prefix = '';  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    52
                 | 
                                    
                             5                          | 
                
                 | 
                        if ( isset($encoding) ) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                            $prefix  = <<<EOS  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                <head id='ar_html_parser_encoding_header'>  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                <meta http-equiv="content-type" content="text/html; charset=$encoding">  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    56
                 | 
                                    
                             2                          | 
                
                 | 
                </head>  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    57
                 | 
                                    
                             2                          | 
                
                 | 
                EOS;  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    58
                 | 
                                    
                             2                          | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    59
                 | 
                                    
                             5                          | 
                
                 | 
                        libxml_disable_entity_loader(); // prevents XXE attacks  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    60
                 | 
                                    
                             5                          | 
                
                 | 
                        $prevErrorSetting = libxml_use_internal_errors(true);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    61
                 | 
                                    
                             5                          | 
                
                 | 
                        if ( $dom->loadHTML( $prefix . $html, $this->options['libxml_options'] ) ) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    62
                 | 
                                    
                             5                          | 
                
                 | 
                            if ( isset($encoding) ) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    63
                 | 
                                    
                             2                          | 
                
                 | 
                                $elm  = $dom->getElementById('ar_html_parser_encoding_header'); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    64
                 | 
                                    
                             2                          | 
                
                 | 
                                if ( isset($elm) ) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    65
                 | 
                                    
                             2                          | 
                
                 | 
                                    $elm->parentNode->removeChild($elm);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    66
                 | 
                                    
                             2                          | 
                
                 | 
                                }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    67
                 | 
                                    
                             2                          | 
                
                 | 
                            }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    68
                 | 
                                    
                             5                          | 
                
                 | 
                            libxml_use_internal_errors( $prevErrorSetting );  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    69
                 | 
                                    
                             5                          | 
                
                 | 
                            return new \arc\html\Proxy( simplexml_import_dom( $dom ), $this );  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                        $errors = libxml_get_errors();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                        libxml_clear_errors();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                        libxml_use_internal_errors( $prevErrorSetting );  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                        $message = 'Incorrect html passed.';  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                        foreach ( $errors as $error ) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                            $message .= "\nline: ".$error->line."; column: ".$error->column."; ".$error->message;  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                        throw new \arc\Exception( $message, \arc\exceptions::ILLEGAL_ARGUMENT );  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                }  | 
            
            
                                                        
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                 |