Completed
Push — master ( f4efa7...b29027 )
by Ryan
05:09 queued 01:59
created

Xml::findNextRealNode()   A

Complexity

Conditions 4
Paths 2

Size

Total Lines 9
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
nc 2
nop 1
dl 0
loc 9
c 0
b 0
f 0
cc 4
rs 9.2
1
<?php
2
/**
3
 * Copyright (c) 2017–2018 Ryan Parman <http://ryanparman.com>.
4
 * Copyright (c) 2017–2018 Contributors.
5
 *
6
 * http://opensource.org/licenses/Apache2.0
7
 */
8
9
declare(strict_types=1);
10
11
namespace SimplePie\Parser;
12
13
use DOMComment;
14
use DOMDocument;
15
use DOMNode;
16
use DOMText;
17
use DOMXPath;
18
use Psr\Http\Message\StreamInterface;
19
use Psr\Log\LoggerInterface;
20
use SimplePie\Enum as E;
21
use SimplePie\HandlerStackInterface;
22
use SimplePie\Mixin as Tr;
23
use SimplePie\SimplePie;
24
use SimplePie\Type\Feed;
25
use SimplePie\Util\Ns;
26
27
/**
 * The core parser for all XML content.
 *
 * The constructor reads the raw document from a PSR-7 stream, parses it into a
 * `DOMDocument`, registers namespaces with the middleware stack, and then
 * invokes that stack against a freshly-created `Feed` object.
 */
class Xml extends AbstractParser
{
    use Tr\DomDocumentTrait;
    use Tr\LoggerTrait;
    use Tr\RawDocumentTrait;

    /**
     * The object which contains the parsed results.
     *
     * @var Feed
     */
    protected $feed;

    /**
     * Bitwise libxml options to use for parsing XML.
     *
     * @var int
     */
    protected $libxml;

    /**
     * The handler stack which contains registered middleware.
     *
     * @var HandlerStackInterface
     */
    protected $middleware;

    /**
     * The XML namespace handler.
     *
     * @var Ns
     */
    protected $ns;

    /**
     * Constructs a new instance of this class.
     *
     * @param StreamInterface       $stream                  A PSR-7 `StreamInterface` which is typically returned by
     *                                                       the `getBody()` method of a `ResponseInterface` class.
     * @param LoggerInterface       $logger                  The PSR-3 logger.
     * @param HandlerStackInterface $handlerStack            The handler stack which contains registered middleware.
     * @param int                   $libxml                  The libxml value to use for parsing XML.
     * @param bool                  $handleHtmlEntitiesInXml Whether or not SimplePie should pre-parse the XML as HTML
     *                                                       to resolve the entities. A value of `true` means that
     *                                                       SimplePie should inject the entity definitions. A value of
     *                                                       `false` means that SimplePie should NOT inject the entity
     *                                                       definitions. This constructor declares no default; the
     *                                                       caller must always pass a value.
     *
     * @throws Error
     * @throws TypeError
     *
     * @phpcs:disable Generic.Functions.OpeningFunctionBraceBsdAllman.BraceOnSameLine
     */
    public function __construct(
        StreamInterface $stream,
        LoggerInterface $logger,
        HandlerStackInterface $handlerStack,
        int $libxml,
        bool $handleHtmlEntitiesInXml
    ) {
        // @phpcs:enable

        // Logger
        $this->logger = $logger;

        // Middleware
        $this->middleware = $handlerStack;

        // Libxml2
        $this->libxml = $libxml;

        // Raw stream
        $this->rawDocument = $this->readStream($stream);

        // DOMDocument
        $this->domDocument = new DOMDocument('1.0', 'utf-8');

        // Don't barf errors all over the output
        \libxml_use_internal_errors(true);

        // DOMDocument configuration
        // NOTE(review): `resolveExternals` and `substituteEntities` are both enabled while parsing content that
        // presumably comes from remote feeds. Confirm that the `$libxml` flags supplied by callers (e.g.,
        // `LIBXML_NONET`) are sufficient to rule out external-entity expansion.
        $this->domDocument->recover             = true;
        $this->domDocument->formatOutput        = false;
        $this->domDocument->preserveWhiteSpace  = false;
        $this->domDocument->resolveExternals    = true;
        $this->domDocument->substituteEntities  = true;
        $this->domDocument->strictErrorChecking = false;
        $this->domDocument->validateOnParse     = false;

        // If enabled, force-inject the contents of `entities.dtd` into the feed.
        if ($handleHtmlEntitiesInXml) {
            $this->getLogger()->debug('Enabled handing HTML entities in XML.');
            $this->injectEntityDefinitions();
        }

        // Parse the XML document with the configured libxml options
        $this->domDocument->loadXML($this->rawDocument, $this->libxml);

        // Register the namespace handler.
        $this->ns = (new Ns($this->domDocument))
            ->setLogger($this->getLogger());

        // Look at which namespaces the registered middleware understands.
        $this->middleware->registerNamespaces($this->ns);

        // Instantiate a new write-to feed object.
        $this->feed = (new Feed($this->getNamespaceAlias() ?? ''))
            ->setLogger($this->getLogger());

        // Invoke the registered middleware.
        $this->middleware->invoke(
            E\FeedType::XML,
            $this->getFeed()->getRoot(),
            $this->getNamespaceAlias(),
            $this->xpath()
        );

        // Clear the libxml errors to avoid excessive memory usage
        \libxml_clear_errors();
    }

    /**
     * Get the XML namespace handler.
     *
     * @return Ns
     */
    public function getNs(): Ns
    {
        return $this->ns;
    }

    /**
     * Get the preferred namespace alias.
     *
     * The alias is looked up from the namespace URI of the document's root element.
     *
     * @return string|null
     */
    public function getNamespaceAlias(): ?string
    {
        $namespace = $this->getNs();

        return $namespace->getPreferredNamespaceAlias(
            $this->domDocument->documentElement->namespaceURI
        );
    }

    /**
     * Gets a reference to the `DOMXPath` object, with the default namespace
     * already registered.
     *
     * A new `DOMXPath` instance is created on every call. The namespace is only
     * registered when a preferred alias is available.
     *
     * @return DOMXPath
     */
    public function xpath()
    {
        $ns    = $this->getNamespaceAlias();
        $xpath = new DOMXPath($this->domDocument);

        // Register the namespace alias with the XPath instance
        if (null !== $ns) {
            $xpath->registerNamespace(
                $ns,
                $this->domDocument->documentElement->namespaceURI ?? ''
            );
        }

        return $xpath;
    }

    /**
     * Some DOMNode names are `#comment` or `#text`. This method will move the
     * pointer to the next node, then the next until it finds a real XML node.
     *
     * Only comment and text nodes are skipped (CDATA sections are a kind of
     * text node in PHP's DOM, so they are skipped as well). Other node kinds,
     * such as processing instructions or a document type, are returned as-is.
     *
     * @param DOMNode $node The `DOMNode` element to evaluate.
     *
     * @throws \TypeError When the given node and all of its following siblings are comment or text nodes.
     *
     * @return DOMNode
     */
    public function findNextRealNode(DOMNode $node): DOMNode
    {
        $n = $node;

        // `instanceof` is already false for `null`, so running out of siblings ends the loop by itself.
        while ($n instanceof DOMComment || $n instanceof DOMText) {
            $n = $n->nextSibling;
        }

        // Returning `null` here would violate the declared return type and surface as an opaque `TypeError`;
        // raise the same exception type explicitly with a message that explains what happened.
        if (null === $n) {
            throw new \TypeError(
                'Could not find a node that is not a comment or text node among the given node and its siblings.'
            );
        }

        return $n;
    }

    /**
     * Retrieves the object which represents the top-level feed.
     *
     * @return Feed
     */
    public function getFeed(): Feed
    {
        return $this->feed;
    }

    /**
     * Inserts the contents of `resources/entities.dtd` into the raw document, immediately before the opening
     * tag of the root element.
     *
     * The raw document is pre-parsed to discover the name of the root element. The definitions are inserted
     * exactly once, at the first occurrence of `<rootName` in the raw text, so that later elements whose names
     * merely begin with the root element's name are left alone. When no root element can be found, or the
     * definition file cannot be read, the raw document is left untouched and the problem is logged.
     */
    private function injectEntityDefinitions(): void
    {
        // Pre-parse so that the root element can be identified.
        $this->domDocument->loadXML($this->rawDocument, $this->libxml);

        // `documentElement` is the root element itself, regardless of any comments, processing instructions,
        // or document type declaration that may precede it.
        $rootElement = $this->domDocument->documentElement;

        if (null === $rootElement) {
            $this->getLogger()->warning('Could not find a root element; skipping injection of HTML entities.');

            return;
        }

        // <feed, <rss, etc.
        $rootElementStart = \sprintf('<%s', (string) $rootElement->nodeName);
        $offset           = \strpos($this->rawDocument, $rootElementStart);

        // Nothing to anchor the injection to.
        if (false === $offset) {
            return;
        }

        // Read the entity definition file
        $definitions = \file_get_contents(\dirname(SIMPLEPIE_ROOT) . '/resources/entities.dtd');

        if (false === $definitions) {
            $this->getLogger()->error('Could not read `entities.dtd`; skipping injection of HTML entities.');

            return;
        }

        // Force-inject the definitions into the XML document (zero-length replacement == insertion).
        $this->rawDocument = \substr_replace($this->rawDocument, \trim($definitions), $offset, 0);
    }
}
245