1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Created by PhpStorm. |
4
|
|
|
* @author hashashiyyin [email protected] / [email protected] |
5
|
|
|
* Date: 24/04/24 |
6
|
|
|
* Time: 18:03 |
7
|
|
|
* |
8
|
|
|
*/ |
9
|
|
|
|
10
|
|
|
namespace Matecat\XmlParser; |
11
|
|
|
|
12
|
|
|
use ArrayObject; |
13
|
|
|
use DOMAttr; |
14
|
|
|
use DOMDocument; |
15
|
|
|
use DOMException; |
16
|
|
|
use DOMNode; |
17
|
|
|
use DOMNodeList; |
18
|
|
|
use DOMText; |
19
|
|
|
use Matecat\XmlParser\Exception\InvalidXmlException; |
20
|
|
|
use Matecat\XmlParser\Exception\XmlParsingException; |
21
|
|
|
|
22
|
|
|
/**
 * Base class for the parsers of this package.
 *
 * It sanitizes the incoming markup (control characters and their hexadecimal
 * character references are swapped with the placeholders listed in
 * {@see AbstractParser::$asciiPlaceHoldMap}), loads it through XmlDomLoader and
 * converts DOM nodes into a nested ArrayObject of plain descriptor objects.
 *
 * Concrete parsers only have to provide the node list to convert, by
 * implementing {@see AbstractParser::getNodeListFromQueryPath()}.
 */
abstract class AbstractParser {

    /**
     * Node name that extractNodes() treats as a fake wrapping root when the
     * input is an XML fragment. The same value is handed to Config in the constructor.
     */
    const fragmentDocumentRoot = '_____root';

    /**
     * Loose detector for hexadecimal character references (uppercase digits, no terminator check).
     * Kept unchanged for backward compatibility; removeNotPrintableChars() uses a stricter,
     * terminated pattern of its own.
     */
    const regexpEntity = '/&#x([0-1]{0,1}[0-9A-F]{1,2})/u';

    /**
     * Matches one single raw control character: U+0000..U+001F or U+007F.
     */
    const regexpAscii = '/([\x{00}-\x{1F}\x{7F}]{1})/u';

    /**
     * Control characters handled by removeNotPrintableChars(), keyed by their
     * two-digit uppercase hexadecimal code.
     *
     * NOTE(review): every 'placeHold' is an empty string here, so the matched
     * characters are effectively removed — confirm this is intended.
     *
     * @var array
     */
    protected static $asciiPlaceHoldMap = [
            '00' => [ 'symbol' => 'NULL', 'placeHold' => '', 'numeral' => 0x00 ],
            '01' => [ 'symbol' => 'SOH', 'placeHold' => '', 'numeral' => 0x01 ],
            '02' => [ 'symbol' => 'STX', 'placeHold' => '', 'numeral' => 0x02 ],
            '03' => [ 'symbol' => 'ETX', 'placeHold' => '', 'numeral' => 0x03 ],
            '04' => [ 'symbol' => 'EOT', 'placeHold' => '', 'numeral' => 0x04 ],
            '05' => [ 'symbol' => 'ENQ', 'placeHold' => '', 'numeral' => 0x05 ],
            '06' => [ 'symbol' => 'ACK', 'placeHold' => '', 'numeral' => 0x06 ],
            '07' => [ 'symbol' => 'BEL', 'placeHold' => '', 'numeral' => 0x07 ],
            '08' => [ 'symbol' => 'BS', 'placeHold' => '', 'numeral' => 0x08 ],
            '09' => [ 'symbol' => 'HT', 'placeHold' => '', 'numeral' => 0x09 ],
            '0A' => [ 'symbol' => 'LF', 'placeHold' => '', 'numeral' => 0x0A ],
            '0B' => [ 'symbol' => 'VT', 'placeHold' => '', 'numeral' => 0x0B ],
            '0C' => [ 'symbol' => 'FF', 'placeHold' => '', 'numeral' => 0x0C ],
            '0D' => [ 'symbol' => 'CR', 'placeHold' => '', 'numeral' => 0x0D ],
            '0E' => [ 'symbol' => 'SO', 'placeHold' => '', 'numeral' => 0x0E ],
            '0F' => [ 'symbol' => 'SI', 'placeHold' => '', 'numeral' => 0x0F ],
            '10' => [ 'symbol' => 'DLE', 'placeHold' => '', 'numeral' => 0x10 ],
            '11' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x11 ],
            '12' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x12 ],
            '13' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x13 ],
            '14' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x14 ],
            '15' => [ 'symbol' => 'NAK', 'placeHold' => '', 'numeral' => 0x15 ],
            '16' => [ 'symbol' => 'SYN', 'placeHold' => '', 'numeral' => 0x16 ],
            '17' => [ 'symbol' => 'ETB', 'placeHold' => '', 'numeral' => 0x17 ],
            '18' => [ 'symbol' => 'CAN', 'placeHold' => '', 'numeral' => 0x18 ],
            '19' => [ 'symbol' => 'EM', 'placeHold' => '', 'numeral' => 0x19 ],
            '1A' => [ 'symbol' => 'SUB', 'placeHold' => '', 'numeral' => 0x1A ],
            '1B' => [ 'symbol' => 'ESC', 'placeHold' => '', 'numeral' => 0x1B ],
            '1C' => [ 'symbol' => 'FS', 'placeHold' => '', 'numeral' => 0x1C ],
            '1D' => [ 'symbol' => 'GS', 'placeHold' => '', 'numeral' => 0x1D ],
            '1E' => [ 'symbol' => 'RS', 'placeHold' => '', 'numeral' => 0x1E ],
            '1F' => [ 'symbol' => 'US', 'placeHold' => '', 'numeral' => 0x1F ],
            '7F' => [ 'symbol' => 'DEL', 'placeHold' => '', 'numeral' => 0x7F ],
    ];

    /**
     * Flag: when truthy, the input is treated as an XML fragment and
     * extractNodes() skips the fake root node named self::fragmentDocumentRoot.
     *
     * @var bool
     */
    protected $isXmlFragment;

    /**
     * Document returned by XmlDomLoader::load() in the constructor.
     *
     * @var DOMDocument
     */
    protected $dom;

    /**
     * Accumulator filled by extractNodes() with the mapped top-level nodes.
     *
     * @var ArrayObject
     */
    protected $elements;

    /**
     * Sanitizes the markup, loads it into a DOM document and prepares an
     * empty element collection.
     *
     * @param string $xml           raw markup to parse
     * @param bool   $isXmlFragment when truthy, self::fragmentDocumentRoot is passed to Config (null otherwise)
     * @param bool   $isHtml        forwarded as is to Config
     *
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    protected function __construct( $xml, $isXmlFragment, $isHtml = false ) {

        $this->isXmlFragment = $isXmlFragment;

        $loaderConfig = new Config(
                ( $isXmlFragment ? self::fragmentDocumentRoot : null ),
                $isHtml,
                LIBXML_NONET | LIBXML_NOBLANKS
        );

        $this->dom      = XmlDomLoader::load( $this->removeNotPrintableChars( $xml ), $loaderConfig );
        $this->elements = new ArrayObject();

    }

    /**
     * Replaces not printable characters with their placeholder, because
     * DOMDocument cannot handle them.
     *
     * Two forms are handled, each in a single pass over the string:
     *  - raw control characters (U+0000..U+001F and U+007F);
     *  - terminated hexadecimal character references to the same code points,
     *    with an optional leading zero and either letter case
     *    ( e.g. "&#xA;", "&#x0A;", "&#x1F;", "&#x1f;" ).
     *
     * Replacements are produced by callbacks, so a placeholder is always
     * inserted literally (never interpreted as a regexp back-reference).
     * When nothing matches, or the regexp engine fails (e.g. malformed UTF-8),
     * the input is returned untouched.
     *
     * @param string $seg
     *
     * @return string
     */
    protected function removeNotPrintableChars( $seg ) {

        $placeHolders = self::$asciiPlaceHoldMap;

        // Pass 1: raw control characters.
        $replacements = 0;
        $cleaned      = preg_replace_callback(
                self::regexpAscii,
                function ( $hit ) use ( $placeHolders ) {
                    return $placeHolders[ sprintf( "%02X", ord( $hit[ 1 ] ) ) ][ 'placeHold' ];
                },
                $seg,
                -1,
                $replacements
        );

        // null means the regexp engine failed: keep the original value in that case
        if ( $cleaned !== null && $replacements > 0 ) {
            $seg = $cleaned;
        }

        // Pass 2: character references. The alternation covers exactly the code points of the map:
        // 0x00-0x0F ( leading zero optional ), 0x10-0x1F and 0x7F.
        $replacements = 0;
        $cleaned      = preg_replace_callback(
                '/&#x(0?[0-9A-Fa-f]|1[0-9A-Fa-f]|7[Ff]);/u',
                function ( $hit ) use ( $placeHolders ) {
                    $key = sprintf( "%02X", hexdec( $hit[ 1 ] ) );

                    // defensive: leave the reference as is if the map has no entry for it
                    return isset( $placeHolders[ $key ] ) ? $placeHolders[ $key ][ 'placeHold' ] : $hit[ 0 ];
                },
                $seg,
                -1,
                $replacements
        );

        if ( $cleaned !== null && $replacements > 0 ) {
            $seg = $cleaned;
        }

        return $seg;

    }

    /**
     * Appends to $elements one descriptor object for every node of the list,
     * recursing into child nodes.
     *
     * Each descriptor exposes:
     *  - node:         the node serialized through DOMDocument::saveXML()
     *  - tagName:      the node name
     *  - attributes:   name => value map ( see getAttributes() )
     *  - text:         text content for DOMText nodes, null otherwise
     *  - self_closed:  null for DOMText nodes, otherwise true when the node has no children
     *  - has_children: null for DOMText nodes, otherwise whether the node has children
     *  - inner_html:   ArrayObject with the mapped children ( empty when there are none )
     *
     * @param DOMNodeList $elementList
     * @param ArrayObject $elements
     *
     * @return ArrayObject the very same $elements instance
     */
    protected function mapElements( DOMNodeList $elementList, ArrayObject $elements ) {

        foreach ( $elementList as $node ) {

            $isText      = $node instanceof DOMText;
            $hasChildren = $node->hasChildNodes();

            $innerNodes = new ArrayObject();
            if ( $hasChildren ) {
                $this->mapElements( $node->childNodes, $innerNodes );
            }

            $elements[] = (object)[
                    'node'         => $this->dom->saveXML( $node ),
                    'tagName'      => $node->nodeName,
                    'attributes'   => $this->getAttributes( $node ),
                    'text'         => $isText ? $node->textContent : null,
                    'self_closed'  => $isText ? null : !$hasChildren,
                    'has_children' => $isText ? null : $hasChildren,
                    'inner_html'   => $innerNodes
            ];

        }

        return $elements;

    }

    /**
     * Collects the attributes of a node into a plain map.
     *
     * @param DOMNode $element
     *
     * @return array attribute name => attribute value, empty when the node has no attributes
     */
    protected function getAttributes( DOMNode $element ) {

        $attributesMap = [];

        if ( $element->hasAttributes() ) {
            /** @var DOMAttr $attr */
            foreach ( $element->attributes as $attr ) {
                $attributesMap[ $attr->nodeName ] = $attr->nodeValue;
            }
        }

        return $attributesMap;

    }

    /**
     * Maps the nodes returned by getNodeListFromQueryPath() into $this->elements.
     *
     * For XML fragments whose first node is the fake root
     * ( self::fragmentDocumentRoot ), the root itself is skipped and its
     * children are mapped instead. An empty node list yields no new elements.
     *
     * @return ArrayObject
     * @throws DOMException when the concrete parser does not return a DOMNodeList
     */
    protected function extractNodes() {

        $nodeList = $this->getNodeListFromQueryPath();

        if ( !$nodeList instanceof DOMNodeList ) {
            throw new DOMException( 'Bad DOMNodeList' );
        }

        // item( 0 ) is null on an empty list: check it before reading nodeName
        $firstNode   = $nodeList->item( 0 );
        $hasFakeRoot = $this->isXmlFragment
                && $firstNode !== null
                && $firstNode->nodeName === self::fragmentDocumentRoot;

        if ( $hasFakeRoot ) {
            // there is a fake root node, skip the first element and start with its child nodes
            $nodeList = $firstNode->childNodes;
        }

        $this->mapElements( $nodeList, $this->elements );

        return $this->elements;

    }

    /**
     * Provides the nodes that extractNodes() has to map.
     *
     * @return DOMNodeList
     */
    abstract protected function getNodeListFromQueryPath();

}