1 | <?php |
||
/**
 * Parses an XML feed document (RSS via a <channel> element, RDF-based RSS 1.0,
 * or Atom 1.0 / Atom 0.3) and exposes its title, link, items and links.
 *
 * Usage: construct with the raw XML string, call init(), then check error()
 * before reading get_title(), get_link(), get_items() or get_links().
 */
class FeedParser {
    /** @var DOMDocument document the feed is loaded into (left empty if loading failed) */
    private $doc;

    /** @var string|null first fatal problem encountered, null when none was recorded */
    private $error;

    /** @var string[] every fatal libxml error reported while loading, formatted */
    private $libxml_errors = [];

    /** @var array feed item objects (FeedItem_Atom / FeedItem_RSS) created by init() */
    private $items;

    /** @var string|null feed-level link, trimmed */
    private $link;

    /** @var string|null feed-level title, trimmed */
    private $title;

    /** @var int|null one of the FEED_* constants once init() recognised the document */
    private $type;

    /** @var DOMXPath|null XPath evaluator with the feed namespaces registered by init() */
    private $xpath;

    const FEED_RDF = 0;
    const FEED_RSS = 1;
    const FEED_ATOM = 2;

    /**
     * Loads the XML and records fatal libxml errors; it does not extract any feed data.
     *
     * @param string $data raw feed XML
     */
    public function __construct($data) {
        libxml_use_internal_errors(true);
        libxml_clear_errors();

        $this->doc = new DOMDocument();
        $this->items = [];

        mb_substitute_character("none");

        // DOMDocument::loadXML() throws ValueError on an empty source in PHP 8;
        // report it through the normal error channel instead of crashing the caller.
        if ($data === null || $data === false || $data === '') {
            $this->error = "Empty feed data";
            return;
        }

        $this->doc->loadXML($data);

        foreach (libxml_get_errors() as $libxml_error) {
            if ($libxml_error->level != LIBXML_ERR_FATAL) {
                continue;
            }

            $formatted = $this->format_error($libxml_error);

            // currently only the first error is reported through error()
            if (!isset($this->error)) {
                $this->error = $formatted;
            }

            $this->libxml_errors[] = $formatted;
        }

        libxml_clear_errors();
    }

    /**
     * Detects the feed type and fills in title, link and items.
     *
     * On an unrecognised document this sets the error (unless one is already
     * recorded) and returns without extracting anything.
     */
    public function init() {
        $xpath = new DOMXPath($this->doc);
        $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
        $xpath->registerNamespace('atom03', 'http://purl.org/atom/ns#');
        $xpath->registerNamespace('media', 'http://search.yahoo.com/mrss/');
        $xpath->registerNamespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
        $xpath->registerNamespace('slash', 'http://purl.org/rss/1.0/modules/slash/');
        $xpath->registerNamespace('dc', 'http://purl.org/dc/elements/1.1/');
        $xpath->registerNamespace('content', 'http://purl.org/rss/1.0/modules/content/');
        $xpath->registerNamespace('thread', 'http://purl.org/syndication/thread/1.0');

        $this->xpath = $xpath;

        $candidates = $xpath->query("(//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF)");
        $root = $candidates ? $candidates->item(0) : null;

        if (!$root) {
            $this->fail_unknown_type();
            return;
        }

        // The query above already matched by namespace, so classify by local
        // name: tagName carries whatever prefix the document happens to use
        // (e.g. "a:feed"), which made valid prefixed feeds look unsupported.
        switch (mb_strtolower($root->localName)) {
            case "rdf":
                $this->type = self::FEED_RDF;
                $this->parse_rdf();
                break;
            case "channel":
                $this->type = self::FEED_RSS;
                $this->parse_rss();
                break;
            case "feed":
                $this->type = self::FEED_ATOM;
                $this->parse_atom();
                break;
            default:
                $this->fail_unknown_type();
                return;
        }

        if ($this->title) {
            $this->title = trim($this->title);
        }

        if ($this->link) {
            $this->link = trim($this->link);
        }
    }

    /** Records the "unsupported" error unless an earlier error is already stored. */
    private function fail_unknown_type() {
        if (!isset($this->error)) {
            $this->error = "Unknown/unsupported feed type";
        }
    }

    /**
     * Returns the first node matched by the first query that matches anything.
     *
     * @param string[] $queries XPath expressions, tried in order
     * @return DOMNode|null
     */
    private function first_match($queries) {
        foreach ($queries as $query) {
            $node = $this->xpath->query($query)->item(0);

            if ($node) {
                return $node;
            }
        }

        return null;
    }

    /** Extracts title, link and entries from an Atom 1.0 feed, falling back to Atom 0.3. */
    private function parse_atom() {
        $title = $this->first_match([
            "//atom:feed/atom:title",
            "//atom03:feed/atom03:title",
        ]);

        if ($title) {
            $this->title = $title->nodeValue;
        }

        // A link without rel is preferred over rel="alternate"; Atom 1.0 over 0.3.
        $link = $this->first_match([
            "//atom:feed/atom:link[not(@rel)]",
            "//atom:feed/atom:link[@rel='alternate']",
            "//atom03:feed/atom03:link[not(@rel)]",
            "//atom03:feed/atom03:link[@rel='alternate']",
        ]);

        if ($link && $link->hasAttributes()) {
            $this->link = $link->getAttribute("href");
        }

        $articles = $this->xpath->query("//atom:entry");

        if (!$articles || $articles->length == 0) {
            $articles = $this->xpath->query("//atom03:entry");
        }

        foreach ($articles as $article) {
            $this->items[] = new FeedItem_Atom($article, $this->doc, $this->xpath);
        }
    }

    /** Extracts title, link and items from a <channel>-based RSS feed. */
    private function parse_rss() {
        $title = $this->first_match(["//channel/title"]);

        if ($title) {
            $this->title = $title->nodeValue;
        }

        $link = $this->first_match(["//channel/link"]);

        if ($link) {
            // an href attribute on <link> wins over the element's text content
            if ($link->getAttribute("href")) {
                $this->link = $link->getAttribute("href");
            } else if ($link->nodeValue) {
                $this->link = $link->nodeValue;
            }
        }

        foreach ($this->xpath->query("//channel/item") as $article) {
            $this->items[] = new FeedItem_RSS($article, $this->doc, $this->xpath);
        }
    }

    /** Extracts title, link and items from an RDF (RSS 1.0) feed. */
    private function parse_rdf() {
        // RSS 1.0 elements live in their own namespace; register it only for this case
        $this->xpath->registerNamespace('rssfake', 'http://purl.org/rss/1.0/');

        $title = $this->first_match(["//rssfake:channel/rssfake:title"]);

        if ($title) {
            $this->title = $title->nodeValue;
        }

        $link = $this->first_match(["//rssfake:channel/rssfake:link"]);

        if ($link) {
            $this->link = $link->nodeValue;
        }

        foreach ($this->xpath->query("//rssfake:item") as $article) {
            $this->items[] = new FeedItem_RSS($article, $this->doc, $this->xpath);
        }
    }

    /**
     * Formats a libxml error object as a one-line message.
     *
     * @param LibXMLError|false|null $error
     * @return string empty string when $error is falsy
     */
    public function format_error($error) {
        if (!$error) {
            return "";
        }

        return sprintf("LibXML error %s at line %d (column %d): %s",
            $error->code, $error->line, $error->column,
            $error->message);
    }

    /**
     * Returns the first recorded error, transcoded UTF-8 to UTF-8 because
     * libxml may have invalid unicode data in error messages.
     *
     * @return string empty string when no error was recorded
     */
    public function error() {
        // avoid handing null to UConverter::transcode() (deprecated in PHP 8.1+)
        if ($this->error === null) {
            return "";
        }

        return UConverter::transcode($this->error, 'UTF-8', 'UTF-8');
    }

    /**
     * Returns all fatal libxml errors collected while loading.
     * WARNING: may return invalid unicode data
     *
     * @return string[]
     */
    public function errors() {
        return $this->libxml_errors;
    }

    /** Returns the feed link passed through clean() (a helper defined outside this class). */
    public function get_link() {
        return clean($this->link);
    }

    /** Returns the feed title passed through clean() (a helper defined outside this class). */
    public function get_title() {
        return clean($this->title);
    }

    /** Returns the item objects built by init(); an empty array before init() or when none were found. */
    public function get_items() {
        return $this->items;
    }

    /**
     * Collects href values of atom:link elements.
     *
     * Atom feeds look at feed-level atom:link elements only, RSS feeds at any
     * atom:link in the document; other feed types always yield an empty array.
     *
     * @param string $rel rel value to match; a falsy value returns every link
     * @return array href values, each trimmed and passed through clean()
     */
    public function get_links($rel) {
        if ($this->type === self::FEED_ATOM) {
            $query = "//atom:feed/atom:link";
        } else if ($this->type === self::FEED_RSS) {
            $query = "//atom:link";
        } else {
            return [];
        }

        $rv = [];

        foreach ($this->xpath->query($query) as $link) {
            if (!$rel || ($link->hasAttribute('rel') && $link->getAttribute('rel') == $rel)) {
                $rv[] = clean(trim($link->getAttribute('href')));
            }
        }

        return $rv;
    }
}
||
246 |