MagpieRSS::known_encoding()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 1
dl 0
loc 9
rs 9.9666
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Project:     MagpieRSS: a simple RSS integration tool
5
 * File:        rss_parse.inc  - parse an RSS or Atom feed
6
 *               return as a simple object.
7
 *
8
 * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
9
 *
10
 * The latest version of MagpieRSS can be obtained from:
11
 * http://magpierss.sourceforge.net
12
 *
13
 * For questions, help, comments, discussion, etc., please join the
14
 * Magpie mailing list:
15
 * [email protected]
16
 *
17
 * @author           Kellan Elliott-McCrea <[email protected]>
18
 * @version          0.7a
19
 * @license          GPL
20
 *
21
 */
22
23
// Debug switch read by MagpieRSS::error(): when truthy errors are raised with
// trigger_error(), otherwise they are written with error_log().
// Guarded so that a value defined before this file is included is kept without a
// "constant already defined" warning.
defined('MAGPIE_DEBUG') || define('MAGPIE_DEBUG', 0);

// Feed type identifiers stored in MagpieRSS::$feed_type.
defined('RSS') || define('RSS', 'RSS');
defined('ATOM') || define('ATOM', 'Atom');
27
28
/**
 * Hybrid parser and result object: takes an RSS or Atom document as a string,
 * parses it in the constructor and exposes the result through public properties
 * ($channel, $items, $image, $textinput, $ERROR, $WARNING).
 *
 * see: rss_fetch.inc for a simpler interface with integrated caching support
 */
class MagpieRSS
{
    public $parser;                  // XML parser handle; only set while the constructor is parsing

    public $current_item = [];       // item currently being parsed
    public $items        = [];       // collection of parsed items
    public $channel      = [];       // hash of channel fields
    public $textinput    = [];
    public $image        = [];
    public $feed_type;               // RSS or ATOM, taken from the root element
    public $feed_version;            // version string of the feed, null if the root element has none
    public $encoding     = '';       // output encoding of parsed rss

    public $_source_encoding = '';   // only set if we have to parse xml prolog

    public $ERROR   = '';
    public $WARNING = '';

    // define some constants

    public $_CONTENT_CONSTRUCTS = ['content', 'summary', 'info', 'title', 'tagline', 'copyright'];
    public $_KNOWN_ENCODINGS    = ['UTF-8', 'US-ASCII', 'ISO-8859-1'];

    // parser variables, useless if you're not a parser, treat as private
    public $stack             = [];    // parser stack, innermost element first
    public $inchannel         = false;
    public $initem            = false;
    public $incontent         = false; // name of the Atom content construct being read, false otherwise
    public $intextinput       = false;
    public $inimage           = false;
    public $current_field     = '';
    public $current_namespace = false; // namespace prefix of the element being read, false for none

    /**
     * Set up the XML parser, parse $source and populate this object.
     *
     * On a setup failure (XML extension missing, parser could not be created) the
     * problem is reported through error() and the object is left empty.
     * XML syntax errors are reported through error() as well; whatever was parsed
     * before the error is kept and normalized.
     *
     * @param string      $source          string containing the RSS or Atom document
     * @param string      $output_encoding character set of the parsed values,
     *                                     defaults to ISO-8859-1
     * @param string|null $input_encoding  character set of $source; leave empty and
     *                                     Magpie will try to figure it out
     * @param bool        $detect_encoding if false Magpie won't attempt to detect
     *                                     the source encoding (caveat emptor)
     */
    public function __construct(
        $source,
        $output_encoding = 'ISO-8859-1',
        $input_encoding = null,
        $detect_encoding = true
    ) {
        // Without the XML extension nothing can be parsed: report and stop here.
        if (!function_exists('xml_parser_create')) {
            $this->error("Failed to load PHP's XML Extension. " . 'http://www.php.net/manual/en/ref.xml.php', E_USER_ERROR);

            return;
        }

        list($parser, $source) = $this->create_parser($source, $output_encoding, $input_encoding, $detect_encoding);

        // xml_parser_create() returns a resource before PHP 8 and an XMLParser object since.
        if (!is_resource($parser) && !($parser instanceof XMLParser)) {
            $this->error("Failed to create an instance of PHP's XML parser. " . 'http://www.php.net/manual/en/ref.xml.php', E_USER_ERROR);

            return;
        }

        $this->parser = $parser;

        // Bind the handlers to this object through callable arrays.
        xml_set_element_handler($this->parser, [$this, 'feed_start_element'], [$this, 'feed_end_element']);
        xml_set_character_data_handler($this->parser, [$this, 'feed_cdata']);

        // Warnings raised while parsing are suppressed; failures are reported below.
        $status = @xml_parse($this->parser, $source);

        if (!$status) {
            $errorcode = xml_get_error_code($this->parser);
            if (XML_ERROR_NONE != $errorcode) {
                $xml_error  = xml_error_string($errorcode);
                $error_line = xml_get_current_line_number($this->parser);
                $error_col  = xml_get_current_column_number($this->parser);

                $this->error("$xml_error at line $error_line, column $error_col");
            }
        }

        // Only resource-based parsers need an explicit free.
        if (is_resource($this->parser)) {
            xml_parser_free($this->parser);
        }
        // Drop the handle: the parser holds callbacks pointing back at this object.
        $this->parser = null;

        $this->normalize();
    }

    /**
     * Start-tag handler: identifies the feed type from the root element and
     * tracks which section (channel, item, textinput, image, Atom content) is open.
     *
     * @param mixed  $p       parser handle supplied by the XML extension (unused)
     * @param string $element element name as reported by the parser
     * @param array  $attrs   attributes of the element; keys are lower-cased in place
     */
    public function feed_start_element($p, $element, &$attrs)
    {
        $el    = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace prefix, and split if found
        $ns = false;
        if (strpos($element, ':')) {
            list($ns, $el) = explode(':', $element, 2);
        }
        if ($ns && 'rdf' !== $ns) {
            $this->current_namespace = $ns;
        }

        // if feed type isn't set, then this is the first element of the feed:
        // identify the feed from the root element
        if (!isset($this->feed_type)) {
            $version = isset($attrs['version']) ? $attrs['version'] : null;

            if ('rdf' === $el) {
                $this->feed_type    = RSS;
                $this->feed_version = '1.0';
            } elseif ('rss' === $el) {
                $this->feed_type    = RSS;
                $this->feed_version = $version;
            } elseif ('feed' === $el) {
                $this->feed_type    = ATOM;
                $this->feed_version = $version;
                $this->inchannel    = true;
            }

            return;
        }

        $is_atom = (ATOM === $this->feed_type);
        // $current_namespace is false (not '') when no prefix is active, so test truthiness.
        $in_default_rss_ns = (RSS === $this->feed_type) && !$this->current_namespace;

        if ('channel' === $el) {
            $this->inchannel = true;
        } elseif ('item' === $el || 'entry' === $el) {
            $this->initem = true;
            if (isset($attrs['rdf:about'])) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        } elseif ($in_default_rss_ns && 'textinput' === $el) {
            // default namespace of an RSS feed: record textinput fields
            $this->intextinput = true;
        } elseif ($in_default_rss_ns && 'image' === $el) {
            // default namespace of an RSS feed: record image fields
            $this->inimage = true;
        } elseif ($is_atom && in_array($el, $this->_CONTENT_CONSTRUCTS, true)) {
            // handle atom content constructs; avoid clashing w/ RSS mod_content
            $this->incontent = ('content' === $el) ? 'atom_content' : $el;
        } elseif ($is_atom && $this->incontent) {
            // inside an Atom content construct (e.g. content or summary):
            // treat inlined tags as text and flatten them into the field
            $attrs_str = implode(' ', array_map('map_attrs', array_keys($attrs), array_values($attrs)));

            $this->append_content("<$element $attrs_str>");

            array_unshift($this->stack, $el);
        } elseif ($is_atom && 'link' === $el) {
            // Atom supports many links per containing element.
            // Magpie treats link elements of type rel='alternate'
            // as being equivalent to RSS's simple link element.
            $rel  = isset($attrs['rel']) ? $attrs['rel'] : '';
            $href = isset($attrs['href']) ? $attrs['href'] : '';

            $link_el = ('alternate' === $rel) ? 'link' : 'link_' . $rel;

            $this->append($link_el, $href);
        } else {
            // set stack[0] to current element
            array_unshift($this->stack, $el);
        }
    }

    /**
     * Character-data handler: stores text in the field that is currently open.
     *
     * @param mixed  $p    parser handle supplied by the XML extension (unused)
     * @param string $text chunk of character data
     */
    public function feed_cdata($p, $text)
    {
        if (ATOM === $this->feed_type && $this->incontent) {
            $this->append_content($text);

            return;
        }

        // Field name is the open element path, outermost first, joined by "_".
        $current_el = implode('_', array_reverse($this->stack));
        $this->append($current_el, $text);
    }

    /**
     * End-tag handler: closes the section opened by feed_start_element() and
     * resets the current namespace.
     *
     * @param mixed  $p  parser handle supplied by the XML extension (unused)
     * @param string $el element name as reported by the parser
     */
    public function feed_end_element($p, $el)
    {
        $el = strtolower($el);

        $is_atom = (ATOM === $this->feed_type);
        // $current_namespace is false (not '') when no prefix is active, so test truthiness.
        $in_default_rss_ns = (RSS === $this->feed_type) && !$this->current_namespace;

        if ('item' === $el || 'entry' === $el) {
            $this->items[]      = $this->current_item;
            $this->current_item = [];
            $this->initem       = false;
        } elseif ($in_default_rss_ns && 'textinput' === $el) {
            $this->intextinput = false;
        } elseif ($in_default_rss_ns && 'image' === $el) {
            $this->inimage = false;
        } elseif ($is_atom && in_array($el, $this->_CONTENT_CONSTRUCTS, true)) {
            $this->incontent = false;
        } elseif ('channel' === $el || 'feed' === $el) {
            $this->inchannel = false;
        } elseif ($is_atom && $this->incontent) {
            // balance tags properly
            // note:  i don't think this is actually necessary
            if (isset($this->stack[0]) && $this->stack[0] == $el) {
                $this->append_content("</$el>");
            } else {
                $this->append_content("<$el>");
            }

            array_shift($this->stack);
        } else {
            array_shift($this->stack);
        }

        $this->current_namespace = false;
    }

    /**
     * Append $str2 to $str1, initialising $str1 to an empty string when unset.
     *
     * @param string|null $str1 target, modified in place
     * @param string      $str2 text to append
     */
    public function concat(&$str1, $str2 = '')
    {
        if (!isset($str1)) {
            $str1 = '';
        }
        $str1 .= $str2;
    }

    /**
     * Append text to the open Atom content construct of the current item, or of
     * the channel when no item is open.
     *
     * @param string $text
     */
    public function append_content($text)
    {
        if ($this->initem) {
            $this->concat($this->current_item[$this->incontent], $text);
        } elseif ($this->inchannel) {
            $this->concat($this->channel[$this->incontent], $text);
        }
    }

    /**
     * Smart append - field and namespace aware.
     *
     * Stores $text under $el in whichever section is currently open; namespaced
     * fields are nested one level deeper under their namespace prefix.
     *
     * @param string $el   field name; nothing is stored when empty
     * @param string $text text to append to the field
     */
    public function append($el, $text)
    {
        if (!$el) {
            return;
        }

        $ns = $this->current_namespace;

        if ($ns) {
            if ($this->initem) {
                $this->concat($this->current_item[$ns][$el], $text);
            } elseif ($this->inchannel) {
                $this->concat($this->channel[$ns][$el], $text);
            } elseif ($this->intextinput) {
                $this->concat($this->textinput[$ns][$el], $text);
            } elseif ($this->inimage) {
                $this->concat($this->image[$ns][$el], $text);
            }

            return;
        }

        // textinput/image are tested before channel because they may be open
        // while the channel flag is still set.
        if ($this->initem) {
            $this->concat($this->current_item[$el], $text);
        } elseif ($this->intextinput) {
            $this->concat($this->textinput[$el], $text);
        } elseif ($this->inimage) {
            $this->concat($this->image[$el], $text);
        } elseif ($this->inchannel) {
            $this->concat($this->channel[$el], $text);
        }
    }

    /**
     * Mirror Atom fields into their RSS counterparts (and vice versa) and add a
     * 'date_timestamp' to items whose date could be parsed.
     *
     * The feed type is tested directly so feeds whose root element carries no
     * version attribute are normalized too.
     */
    public function normalize()
    {
        if (ATOM === $this->feed_type) {
            // if atom populate rss fields
            $this->channel['description'] = isset($this->channel['tagline']) ? $this->channel['tagline'] : null;

            foreach ($this->items as $i => $item) {
                if (isset($item['summary'])) {
                    $item['description'] = $item['summary'];
                }
                if (isset($item['atom_content'])) {
                    $item['content']['encoded'] = $item['atom_content'];
                }

                if (isset($item['issued'])) {
                    $atom_date = $item['issued'];
                } else {
                    $atom_date = isset($item['modified']) ? $item['modified'] : null;
                }

                if ($atom_date) {
                    // parse_w3cdtf() is declared elsewhere in this file; notices are suppressed as before.
                    $epoch = @parse_w3cdtf($atom_date);
                    if ($epoch && $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        } elseif (RSS === $this->feed_type) {
            $this->channel['tagline'] = isset($this->channel['description']) ? $this->channel['description'] : null;

            foreach ($this->items as $i => $item) {
                if (isset($item['description'])) {
                    $item['summary'] = $item['description'];
                }
                if (isset($item['content']['encoded'])) {
                    $item['atom_content'] = $item['content']['encoded'];
                }

                if ('1.0' === $this->feed_version && isset($item['dc']['date'])) {
                    $epoch = @parse_w3cdtf($item['dc']['date']);
                    if ($epoch && $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                } elseif (isset($item['pubdate'])) {
                    $epoch = @strtotime($item['pubdate']);
                    if ($epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        }
    }

    /**
     * @return string|false|null the feed version when the feed is RSS (null if the
     *                           root element had no version attribute), false otherwise
     */
    public function is_rss()
    {
        return (RSS === $this->feed_type) ? $this->feed_version : false;
    }

    /**
     * @return string|false|null the feed version when the feed is Atom (null if the
     *                           root element had no version attribute), false otherwise
     */
    public function is_atom()
    {
        return (ATOM === $this->feed_type) ? $this->feed_version : false;
    }

    /**
     * Return an XML parser, and the possibly re-encoded source.
     *
     * @param string      $source  document to be parsed
     * @param string      $out_enc target encoding; when non-empty it is stored in
     *                             $encoding and set on the parser
     * @param string|null $in_enc  declared source encoding
     * @param bool        $detect  whether the source encoding may be detected
     * @return array [parser, source]
     */
    public function create_parser($source, $out_enc, $in_enc, $detect)
    {
        // Everything from PHP 5 on takes the libxml2 path, not only PHP 5 itself.
        if (version_compare(PHP_VERSION, '5.0.0', '>=')) {
            $parser = $this->php5_create_parser($in_enc, $detect);
        } else {
            list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
        }

        if ($out_enc) {
            $this->encoding = $out_enc;
            if ($parser) {
                xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
            }
        }

        return [$parser, $source];
    }

    /**
     * Instantiate an XML parser under PHP5 and later.
     *
     * PHP5 will do a fine job of detecting input encoding
     * if passed an empty string as the encoding.
     *
     * All hail libxml2!
     *
     * @param string|null $in_enc encoding to force when detection is disabled
     * @param bool        $detect whether the parser should detect the encoding
     * @return resource|XMLParser|false
     */
    public function php5_create_parser($in_enc, $detect)
    {
        // by default php5 does a fine job of detecting input encodings
        $forced = (!$detect && $in_enc) ? $in_enc : '';

        return xml_parser_create($forced);
    }

    /**
     * Instantiate an XML parser under PHP4.
     *
     * Unfortunately PHP4's support for character encodings
     * and especially XML and character encodings sucks.  As
     * long as the documents you parse only contain characters
     * from the ISO-8859-1 character set (a superset of ASCII,
     * and a subset of UTF-8) you're fine.  However once you
     * step out of that comfy little world things get mad, bad,
     * and dangerous to know.
     *
     * The following code is based on SJM's work with FoF
     * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
     *
     * @param string      $source document to be parsed
     * @param string|null $in_enc declared source encoding
     * @param bool        $detect whether the encoding may be read from the XML prolog
     * @return array [parser, source]
     */
    public function php4_create_parser($source, $in_enc, $detect)
    {
        if (!$detect) {
            return [xml_parser_create($in_enc), $source];
        }

        if (!$in_enc) {
            // Read the encoding from the XML prolog; "<\?" matches the literal "<?".
            if (preg_match('/<\?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
                $in_enc                 = strtoupper($m[1]);
                $this->_source_encoding = $in_enc;
            } else {
                $in_enc = 'UTF-8';
            }
        }

        if (false !== $this->known_encoding($in_enc)) {
            return [xml_parser_create($in_enc), $source];
        }

        // The detected encoding is not one of the simple encodings PHP knows:
        // fall back to a parser with the default encoding and the untouched source.
        return [xml_parser_create(), $source];
    }

    /**
     * Check an encoding name against $_KNOWN_ENCODINGS, ignoring case.
     *
     * @param string $enc encoding name
     * @return string|false the upper-cased name when known, false otherwise
     */
    public function known_encoding($enc)
    {
        $enc = strtoupper((string)$enc);

        return in_array($enc, $this->_KNOWN_ENCODINGS, true) ? $enc : false;
    }

    /**
     * Report a problem and remember it on the object.
     *
     * The message goes to trigger_error() when MAGPIE_DEBUG is truthy and to
     * error_log() otherwise. Notice-level messages are stored in $WARNING, all
     * others in $ERROR.
     *
     * @param string $errormsg message to report
     * @param int    $lvl      error level, one of the E_* constants
     */
    public function error($errormsg, $lvl = E_USER_WARNING)
    {
        if (MAGPIE_DEBUG) {
            trigger_error($errormsg, $lvl);
        } else {
            error_log($errormsg, 0);
        }

        $notices = E_USER_NOTICE | E_NOTICE;
        if ($lvl & $notices) {
            $this->WARNING = $errormsg;
        } else {
            $this->ERROR = $errormsg;
        }
    }
} // end class RSS
567
568
/**
 * Render one attribute as name="value" for re-emitting an inline tag as text.
 *
 * The characters &, ", < and > in the value are escaped so that a value which
 * contains them cannot terminate the attribute or the tag early. Only these
 * four ASCII characters are replaced; all other bytes pass through untouched.
 *
 * @param string $k attribute name
 * @param string $v attribute value
 * @return string
 */
function map_attrs($k, $v)
{
    $escaped = strtr((string)$v, [
        '&' => '&amp;',
        '"' => '&quot;',
        '<' => '&lt;',
        '>' => '&gt;',
    ]);

    return "$k=\"$escaped\"";
}
577
578
/**
 * Convert a W3C date-time (W3CDTF) string into a Unix timestamp.
 *
 * The date is required; time and zone designator ("Z", "+hh:mm" or "-hh:mm")
 * are optional. Missing time parts count as zero and a missing zone is treated
 * as GMT.
 *
 * @param string $date_str date string to parse
 * @return int Unix timestamp, or -1 when $date_str contains no recognisable date
 */
function parse_w3cdtf($date_str)
{
    // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":ss",
    // 7 seconds, 8 zone sign, 9 zone hours, 10 zone minutes, 11 "Z".
    $pat = '/(\d{4})-(\d{2})-(\d{2})[T]?(\d{2})?[:]?(\d{2})?(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';

    if (!preg_match($pat, (string)$date_str, $match)) {
        return -1;
    }

    // preg_match() omits trailing groups that did not take part in the match;
    // pad them so every index below exists.
    $match += array_fill(0, 12, '');

    // calc epoch for the given date assuming GMT; seconds are group 7, not the
    // colon-prefixed group 6
    $epoch = gmmktime(
        (int)$match[4],
        (int)$match[5],
        (int)$match[7],
        (int)$match[2],
        (int)$match[3],
        (int)$match[1]
    );

    // zulu time, aka GMT: nothing to adjust
    if ('Z' === $match[11]) {
        return $epoch;
    }

    $offset_secs = (((int)$match[9] * 60) + (int)$match[10]) * 60;

    // is timezone ahead of GMT?  then subtract offset
    if ('+' === $match[8]) {
        $offset_secs = -$offset_secs;
    }

    return $epoch + $offset_secs;
}
631