AbstractParser::getAttributes()   A
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 16
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 3
eloc 6
c 1
b 1
f 0
nc 3
nop 1
dl 0
loc 16
rs 10
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author hashashiyyin [email protected] / [email protected]
5
 * Date: 24/04/24
6
 * Time: 18:03
7
 *
8
 */
9
10
namespace Matecat\XmlParser;
11
12
use ArrayObject;
13
use DOMAttr;
14
use DOMDocument;
15
use DOMException;
16
use DOMNode;
17
use DOMNodeList;
18
use DOMText;
19
use Matecat\XmlParser\Exception\InvalidXmlException;
20
use Matecat\XmlParser\Exception\XmlParsingException;
21
22
/**
 * Base class for parsers that load a markup string into a DOM document and
 * flatten the resulting nodes into a tree of plain objects.
 *
 * Concrete parsers only have to provide the node list to start from
 * (see getNodeListFromQueryPath()).
 */
abstract class AbstractParser {

    /**
     * Name of the artificial root element expected around XML fragments.
     */
    const fragmentDocumentRoot = '_____root';

    /**
     * Loose matcher for hexadecimal character references (uppercase digits only).
     * Kept for backward compatibility: removeNotPrintableChars() no longer relies on it.
     */
    const regexpEntity         = '/&#x([0-1]{0,1}[0-9A-F]{1,2})/u'; //&#x1E;  &#xE;

    /**
     * Matches a single raw control character: 0x00-0x1F or 0x7F.
     */
    const regexpAscii          = '/([\x{00}-\x{1F}\x{7F}]{1})/u';

    /**
     * Control characters handled by removeNotPrintableChars(), keyed by their
     * two-digit uppercase hexadecimal code.
     *
     * Every 'placeHold' is currently the empty string, so a matched character
     * is effectively removed from the input.
     *
     * NOTE(review): 0x11-0x14 are DC1-DC4 in ASCII, but all four share the symbol 'DC' here.
     *
     * @var array
     */
    protected static $asciiPlaceHoldMap = [
            '00' => [ 'symbol' => 'NULL', 'placeHold' => '', 'numeral' => 0x00 ],
            '01' => [ 'symbol' => 'SOH', 'placeHold' => '', 'numeral' => 0x01 ],
            '02' => [ 'symbol' => 'STX', 'placeHold' => '', 'numeral' => 0x02 ],
            '03' => [ 'symbol' => 'ETX', 'placeHold' => '', 'numeral' => 0x03 ],
            '04' => [ 'symbol' => 'EOT', 'placeHold' => '', 'numeral' => 0x04 ],
            '05' => [ 'symbol' => 'ENQ', 'placeHold' => '', 'numeral' => 0x05 ],
            '06' => [ 'symbol' => 'ACK', 'placeHold' => '', 'numeral' => 0x06 ],
            '07' => [ 'symbol' => 'BEL', 'placeHold' => '', 'numeral' => 0x07 ],
            '08' => [ 'symbol' => 'BS', 'placeHold' => '', 'numeral' => 0x08 ],
            '09' => [ 'symbol' => 'HT', 'placeHold' => '', 'numeral' => 0x09 ],
            '0A' => [ 'symbol' => 'LF', 'placeHold' => '', 'numeral' => 0x0A ],
            '0B' => [ 'symbol' => 'VT', 'placeHold' => '', 'numeral' => 0x0B ],
            '0C' => [ 'symbol' => 'FF', 'placeHold' => '', 'numeral' => 0x0C ],
            '0D' => [ 'symbol' => 'CR', 'placeHold' => '', 'numeral' => 0x0D ],
            '0E' => [ 'symbol' => 'SO', 'placeHold' => '', 'numeral' => 0x0E ],
            '0F' => [ 'symbol' => 'SI', 'placeHold' => '', 'numeral' => 0x0F ],
            '10' => [ 'symbol' => 'DLE', 'placeHold' => '', 'numeral' => 0x10 ],
            '11' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x11 ],
            '12' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x12 ],
            '13' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x13 ],
            '14' => [ 'symbol' => 'DC', 'placeHold' => '', 'numeral' => 0x14 ],
            '15' => [ 'symbol' => 'NAK', 'placeHold' => '', 'numeral' => 0x15 ],
            '16' => [ 'symbol' => 'SYN', 'placeHold' => '', 'numeral' => 0x16 ],
            '17' => [ 'symbol' => 'ETB', 'placeHold' => '', 'numeral' => 0x17 ],
            '18' => [ 'symbol' => 'CAN', 'placeHold' => '', 'numeral' => 0x18 ],
            '19' => [ 'symbol' => 'EM', 'placeHold' => '', 'numeral' => 0x19 ],
            '1A' => [ 'symbol' => 'SUB', 'placeHold' => '', 'numeral' => 0x1A ],
            '1B' => [ 'symbol' => 'ESC', 'placeHold' => '', 'numeral' => 0x1B ],
            '1C' => [ 'symbol' => 'FS', 'placeHold' => '', 'numeral' => 0x1C ],
            '1D' => [ 'symbol' => 'GS', 'placeHold' => '', 'numeral' => 0x1D ],
            '1E' => [ 'symbol' => 'RS', 'placeHold' => '', 'numeral' => 0x1E ],
            '1F' => [ 'symbol' => 'US', 'placeHold' => '', 'numeral' => 0x1F ],
            '7F' => [ 'symbol' => 'DEL', 'placeHold' => '', 'numeral' => 0x7F ],
    ];

    /**
     * Truthy when the input is a fragment rather than a full document.
     *
     * @var bool
     */
    protected $isXmlFragment;

    /**
     * Document produced by XmlDomLoader::load() in the constructor.
     *
     * @var DOMDocument
     */
    protected $dom;

    /**
     * Accumulator filled by extractNodes().
     *
     * @var ArrayObject
     */
    protected $elements;

    /**
     * Sanitizes the input, loads it into a DOM document and prepares an empty result list.
     *
     * @param string $xml           raw markup; control characters are stripped before loading
     * @param bool   $isXmlFragment when truthy, self::fragmentDocumentRoot is handed to Config
     *                              (presumably the loader wraps the fragment in it; confirm in XmlDomLoader)
     * @param bool   $isHtml        forwarded to Config as is
     *
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    protected function __construct( $xml, $isXmlFragment, $isHtml = false ) {
        $this->isXmlFragment = $isXmlFragment;

        $rootName = $isXmlFragment ? self::fragmentDocumentRoot : null;
        $config   = new Config( $rootName, $isHtml, LIBXML_NONET | LIBXML_NOBLANKS );

        $this->dom      = XmlDomLoader::load( $this->removeNotPrintableChars( $xml ), $config );
        $this->elements = new ArrayObject();
    }

    /**
     * Replaces control characters with their placeholder from self::$asciiPlaceHoldMap,
     * because DOMDocument cannot handle them.
     *
     * Two forms are handled, in this order:
     *  - raw characters in the range 0x00-0x1F plus 0x7F (this includes HT, LF and CR);
     *  - hexadecimal character references to the same code points, e.g. &#x1E; &#xE; &#x1e; &#x001E;
     *
     * References to any other code point are left untouched.
     *
     * @param string $seg
     *
     * @return string
     */
    protected function removeNotPrintableChars( $seg ) {

        $placeHolders = self::$asciiPlaceHoldMap;

        // Raw control characters: a single pass, each match is swapped for its own placeholder.
        $withoutControls = preg_replace_callback(
                self::regexpAscii,
                function ( array $match ) use ( $placeHolders ) {
                    return $placeHolders[ sprintf( "%02X", ord( $match[ 1 ] ) ) ][ 'placeHold' ];
                },
                $seg
        );

        // null means the regex engine failed (e.g. the input is not valid UTF-8 under /u):
        // in that case the input is passed on unchanged.
        if ( $withoutControls !== null ) {
            $seg = $withoutControls;
        }

        // Character references: hex digits are matched case-insensitively and leading zeros are
        // tolerated, so &#x1e; and &#x001E; are handled exactly like &#x1E;.
        // The pattern is pure ASCII, hence no /u modifier is needed.
        $withoutEntities = preg_replace_callback(
                '/&#x0*([0-9A-Fa-f]{1,2});/',
                function ( array $match ) use ( $placeHolders ) {
                    $key = sprintf( "%02X", hexdec( $match[ 1 ] ) );

                    // Only code points listed in the map are replaced; anything else is a legal reference.
                    return isset( $placeHolders[ $key ] ) ? $placeHolders[ $key ][ 'placeHold' ] : $match[ 0 ];
                },
                $seg
        );

        return $withoutEntities !== null ? $withoutEntities : $seg;
    }

    /**
     * Appends one descriptor object per node of $elementList to $elements, recursing into children.
     *
     * Each descriptor exposes: node (serialized markup), tagName, attributes, text,
     * self_closed, has_children and inner_html (an ArrayObject of child descriptors).
     * For DOMText nodes text holds the content while self_closed and has_children are null;
     * for every other node text is null.
     *
     * @param DOMNodeList $elementList
     * @param ArrayObject $elements
     *
     * @return ArrayObject the same instance received as $elements
     */
    protected function mapElements( DOMNodeList $elementList, ArrayObject $elements ) {

        foreach ( $elementList as $element ) {

            $isText      = $element instanceof DOMText;
            $hasChildren = $element->hasChildNodes();

            $elements[] = (object)[
                    'node'         => $this->dom->saveXML( $element ),
                    'tagName'      => $element->nodeName,
                    'attributes'   => $this->getAttributes( $element ),
                    'text'         => $isText ? $element->textContent : null,
                    'self_closed'  => $isText ? null : !$hasChildren,
                    'has_children' => $isText ? null : $hasChildren,
                    'inner_html'   => $hasChildren ? $this->mapElements( $element->childNodes, new ArrayObject() ) : new ArrayObject()
            ];

        }

        return $elements;

    }

    /**
     * Collects the attributes of a node as a name => value map.
     *
     * @param DOMNode $element
     *
     * @return array empty when the node carries no attributes
     */
    protected function getAttributes( DOMNode $element ) {

        $attributesMap = [];

        if ( $element->hasAttributes() ) {
            /**
             * @var DOMAttr $attr
             */
            foreach ( $element->attributes as $attr ) {
                $attributesMap[ $attr->nodeName ] = $attr->nodeValue;
            }
        }

        return $attributesMap;

    }

    /**
     * Maps the nodes returned by getNodeListFromQueryPath() into $this->elements.
     *
     * When parsing a fragment whose first node is the artificial root, that root is
     * skipped and mapping starts from its children.
     *
     * @return ArrayObject
     * @throws DOMException when the concrete parser does not return a DOMNodeList
     */
    protected function extractNodes() {

        $htmlNodeList = $this->getNodeListFromQueryPath();

        // The abstract method declares no return type, so the contract is enforced here.
        if ( !$htmlNodeList instanceof DOMNodeList ) {
            throw new DOMException( 'Bad DOMNodeList' );
        }

        // item( 0 ) is null for an empty list: check it before reading nodeName.
        $firstNode   = $htmlNodeList->item( 0 );
        $hasFakeRoot = $this->isXmlFragment
                && $firstNode !== null
                && $firstNode->nodeName === self::fragmentDocumentRoot;

        $this->mapElements( $hasFakeRoot ? $firstNode->childNodes : $htmlNodeList, $this->elements );

        return $this->elements;

    }

    /**
     * Provides the nodes extractNodes() starts from.
     *
     * @return DOMNodeList
     */
    abstract protected function getNodeListFromQueryPath();

}