1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace arc\html; |
4
|
|
|
|
5
|
|
|
/**
 * Parses HTML strings (full documents or fragments) into \arc\html\Proxy objects.
 *
 * Parsing is done with DOMDocument::loadHTML(); the resulting DOM is handed to
 * simplexml_import_dom() and wrapped in a Proxy together with this parser.
 */
class Parser
{
    /**
     * Parser settings.
     *
     * 'libxml_options' is passed as the options argument of DOMDocument::loadHTML().
     *
     * @var array
     */
    public $options = [
        'libxml_options' => 0
    ];

    /**
     * @param array $options Settings to override; only the keys listed in
     *                       $optionList below are accepted, all others are ignored.
     */
    public function __construct( $options = array() )
    {
        $optionList = [ 'libxml_options' ];
        foreach ( $options as $option => $optionValue ) {
            // Strict comparison: with a loose in_array() an integer key such as 0
            // compares equal to any non-numeric string on PHP < 8.
            if ( in_array( $option, $optionList, true ) ) {
                // Store in $this->options, which is what parseFull() reads.
                $this->options[$option] = $optionValue;
            }
        }
    }

    /**
     * Parses HTML into a Proxy.
     *
     * @param mixed       $html     HTML source (cast to string), or an already parsed Proxy.
     * @param string|null $encoding Optional character set; when given, a temporary
     *                              content-type meta tag is injected for the parse.
     * @return \arc\html\Proxy A Proxy around null for empty input, the input itself
     *                         when it already is a Proxy, otherwise the parse result.
     * @throws \arc\Exception When the HTML cannot be loaded.
     */
    public function parse( $html, $encoding = null )
    {
        if ( !$html ) {
            return new \arc\html\Proxy( null, $this );
        }
        if ( $html instanceof Proxy ) { // already parsed
            return $html;
        }
        $html = (string) $html;
        // A body start tag, with or without attributes, marks a full document.
        if ( preg_match( '/<body[\s>\/]/i', $html ) ) {
            return $this->parseFull( $html, $encoding );
        }
        return $this->parsePartial( $html, $encoding );
    }

    /**
     * Parses an HTML fragment by wrapping it in a marker body element and
     * returning the children of that element.
     *
     * @param string      $html
     * @param string|null $encoding
     * @return \arc\html\Proxy
     * @throws \arc\Exception
     */
    private function parsePartial( $html, $encoding )
    {
        $result = $this->parseFull( '<body id="ArcPartialHTML">' . $html . '</body>', $encoding );
        if ( !$result ) {
            throw new \arc\Exception( 'parse error' );
        }
        return new \arc\html\Proxy( $result->find( '#ArcPartialHTML' )[0]->children(), $this );
    }

    /**
     * Collects the pending libxml errors, restores the previous error handling
     * setting and throws an exception listing the errors.
     *
     * @param bool $prevErrorSetting Value to pass back to libxml_use_internal_errors().
     * @throws \arc\Exception Always.
     */
    private function throwError( $prevErrorSetting )
    {
        $errors = libxml_get_errors();
        libxml_clear_errors();
        libxml_use_internal_errors( $prevErrorSetting );
        $message = 'Incorrect html passed.';
        foreach ( $errors as $error ) {
            $message .= "\nline: " . $error->line . "; column: " . $error->column . "; " . $error->message;
        }
        throw new \arc\Exception( $message, \arc\exceptions::ILLEGAL_ARGUMENT );
    }

    /**
     * Injects a temporary content-type meta tag (id "ArcTempEncoding") that
     * declares $encoding.
     *
     * The tag is placed directly after the first head start tag; if there is
     * none, a head element containing it is placed before the first body start
     * tag; otherwise it is prepended to the input.
     *
     * @param string $html
     * @param string $encoding
     * @return string
     */
    private function insertEncoding( $html, $encoding )
    {
        $meta = '<meta id="ArcTempEncoding" http-equiv="content-type" content="text/html; charset='
            . htmlspecialchars( $encoding ) . '">';

        // The patterns require whitespace or '>' after the tag name so that
        // e.g. <header> is not mistaken for <head>. Callbacks are used so that
        // '$' or '\' in $encoding is never treated as a backreference.
        $insertions = [
            '/<head(?:\s[^>]*)?>/i' => static function ( $match ) use ( $meta ) {
                return $match[0] . $meta;
            },
            '/<body(?:\s[^>]*)?>/i' => static function ( $match ) use ( $meta ) {
                return '<head>' . $meta . '</head>' . $match[0];
            },
        ];
        foreach ( $insertions as $pattern => $insert ) {
            $count = 0;
            // Limit 1: removeEncoding() removes a single element only.
            $result = preg_replace_callback( $pattern, $insert, $html, 1, $count );
            if ( $result !== null && $count > 0 ) {
                return $result;
            }
        }
        return $meta . $html;
    }

    /**
     * Removes the temporary meta tag added by insertEncoding(), if it is present.
     *
     * @param \DOMDocument $dom
     */
    private function removeEncoding( $dom )
    {
        $meta = $dom->getElementById( 'ArcTempEncoding' );
        if ( $meta !== null && $meta->parentNode !== null ) {
            $meta->parentNode->removeChild( $meta );
        }
    }

    /**
     * Loads $html into a DOMDocument and wraps the result in a Proxy.
     *
     * libxml errors are collected internally while loading; the previous
     * libxml_use_internal_errors() setting is restored afterwards.
     *
     * @param string      $html
     * @param string|null $encoding
     * @return \arc\html\Proxy
     * @throws \arc\Exception When DOMDocument::loadHTML() reports failure.
     */
    private function parseFull( $html, $encoding )
    {
        $dom = new \DOMDocument();
        if ( \PHP_VERSION_ID < 80000 ) {
            // Prevents XXE attacks. The function is deprecated as of PHP 8.0,
            // so it is only called on older versions.
            libxml_disable_entity_loader();
        }
        $prevErrorSetting = libxml_use_internal_errors( true );
        if ( $encoding ) {
            $html = $this->insertEncoding( $html, $encoding );
        }
        if ( !$dom->loadHTML( $html, $this->options['libxml_options'] ) ) {
            $this->throwError( $prevErrorSetting );
        }
        if ( $encoding ) {
            $this->removeEncoding( $dom );
        }
        libxml_use_internal_errors( $prevErrorSetting );
        return new \arc\html\Proxy( simplexml_import_dom( $dom ), $this );
    }
}
99
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In that case, it is vital that someone uncomments it again; otherwise your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.