Completed
Pull Request — 1.10.x (#1154)
by
unknown
45:16
created

MagpieRSS::normalize()   D

Complexity

Conditions 19
Paths 47

Size

Total Lines 48
Code Lines 32

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 19
eloc 32
nc 47
nop 0
dl 0
loc 48
rs 4.8943

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
/**
3
 * Project:     MagpieRSS: a simple RSS integration tool
4
 * File:        rss_parse.inc  - parse an RSS or Atom feed
5
 *               return as a simple object.
6
 *
7
 * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
8
 *
9
 * The latest version of MagpieRSS can be obtained from:
10
 * http://magpierss.sourceforge.net
11
 *
12
 * For questions, help, comments, discussion, etc., please join the
13
 * Magpie mailing list:
14
 * [email protected]
15
 *
16
 * @author           Kellan Elliott-McCrea <[email protected]>
17
 * @version          0.7a
18
 * @license          GPL
19
 * @package chamilo.include.rss
20
 */
21
/**
22
 * Code
23
 */
24
// Feed-type identifiers stored in MagpieRSS::$feed_type.
// Guarded so that loading another bundled copy of Magpie (which defines the
// same constant names) does not raise "constant already defined".
if (!defined('RSS')) {
    define('RSS', 'RSS');
}
if (!defined('ATOM')) {
    define('ATOM', 'Atom');
}
26
27
require_once (MAGPIE_DIR . 'rss_utils.inc');
28
29
/**
30
* Hybrid parser, and object, takes RSS as a string and returns a simple object.
31
*
32
* see: rss_fetch.inc for a simpler interface with integrated caching support
33
*
34
 * @package chamilo.include.rss
35
*/
36
class MagpieRSS {
    // XML parser handle while parsing. On PHP 8+ (object parsers) it is reset
    // to null after parsing so the feed object stays serializable.
    public $parser;

    public $current_item   = array();  // item currently being parsed
    public $items          = array();  // collection of parsed items
    public $channel        = array();  // hash of channel fields
    public $textinput      = array();
    public $image          = array();
    public $feed_type;                 // RSS or ATOM, set from the root element
    public $feed_version;              // value of the root "version" attribute, if any
    public $encoding       = '';       // output encoding of parsed rss

    private $_source_encoding = '';    // only set if we have to parse xml prolog
    public $source_encoding   = '';    // legacy public mirror of $_source_encoding

    public $ERROR = "";
    public $WARNING = "";

    // define some constants

    private $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
    private $_KNOWN_ENCODINGS    = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

    // parser variables, useless if you're not a parser, treat as private
    public $stack              = array(); // parser stack
    public $inchannel          = false;
    public $initem             = false;
    public $incontent          = false; // if in Atom <content mode="xml"> field
    public $intextinput        = false;
    public $inimage            = false;
    public $current_namespace  = false;

    /**
     *  Set up XML parser, parse source, and populate this RSS object.
     *
     *  On failure the message is recorded in $this->ERROR (or $this->WARNING
     *  for notices) via error(); no exception is thrown.
     *
     *  NOTE:  Probably a good idea to leave the encoding options alone unless
     *         you know what you're doing as PHP's character set support is
     *         a little weird.
     *
     *  @param string $source           string containing the RSS to be parsed
     *  @param string $output_encoding  output the parsed RSS in this character
     *                                  set; defaults to ISO-8859-1.
     *  @param string $input_encoding   the character set of the incoming RSS source.
     *                                  Leave blank and Magpie will try to figure it
     *                                  out.
     *  @param bool   $detect_encoding  if false Magpie won't attempt to detect
     *                                  source encoding. (caveat emptor)
     */
    public function __construct($source, $output_encoding='ISO-8859-1',
                        $input_encoding=null, $detect_encoding=true)
    {
        // if PHP xml isn't compiled in, there is nothing we can do
        if (!function_exists('xml_parser_create')) {
            $this->error( "Failed to load PHP's XML Extension. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                           E_USER_ERROR );
            return;
        }

        list($parser, $source) = $this->create_parser($source,
                $output_encoding, $input_encoding, $detect_encoding);

        if (!$this->_is_parser($parser)) {
            $this->error( "Failed to create an instance of PHP's XML parser. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                          E_USER_ERROR );
            return;
        }

        $this->parser = $parser;

        // Register the handlers as object callables; this replaces the
        // xml_set_object() + method-name-string combination.
        xml_set_element_handler($this->parser,
                array($this, 'feed_start_element'),
                array($this, 'feed_end_element'));
        xml_set_character_data_handler($this->parser, array($this, 'feed_cdata'));

        $status = xml_parse($this->parser, $source);

        if (!$status) {
            $errorcode = xml_get_error_code($this->parser);
            if ($errorcode != XML_ERROR_NONE) {
                $xml_error  = xml_error_string($errorcode);
                $error_line = xml_get_current_line_number($this->parser);
                $error_col  = xml_get_current_column_number($this->parser);
                $this->error("$xml_error at line $error_line, column $error_col");
            }
        }

        if (is_resource($this->parser)) {
            xml_parser_free($this->parser);
        }
        else {
            // Object parsers cannot be serialized; drop the handle so the
            // populated feed object can still be cached with serialize().
            $this->parser = null;
        }

        $this->normalize();
    }

    /**
     * PHP4-style constructor, kept so code that calls it explicitly still works.
     * Delegates to __construct().
     */
    public function MagpieRSS($source, $output_encoding='ISO-8859-1',
                        $input_encoding=null, $detect_encoding=true)
    {
        self::__construct($source, $output_encoding, $input_encoding, $detect_encoding);
    }

    /**
     * Tell whether $parser is a usable XML parser handle
     * (a resource on PHP < 8, an XMLParser object on PHP >= 8).
     */
    private function _is_parser($parser) {
        return is_resource($parser) || $parser instanceof XMLParser;
    }

    /**
     * XML start-element handler: identifies the feed type on the root element
     * and afterwards tracks which section (channel, item, textinput, image,
     * Atom content) subsequent character data belongs to.
     */
    public function feed_start_element($p, $element, &$attrs) {
        $el = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace, and split if found
        $ns = false;
        if ( strpos( $element, ':' ) ) {
            list($ns, $el) = explode( ':', $element, 2);
        }
        if ( $ns and $ns != 'rdf' ) {
            $this->current_namespace = $ns;
        }

        // if feed type isn't set, then this is first element of feed
        // identify feed from root element
        if (!isset($this->feed_type) ) {
            // the version attribute is optional (e.g. absent on Atom 1.0 feeds)
            $version = isset($attrs['version']) ? $attrs['version'] : null;
            if ( $el == 'rdf' ) {
                $this->feed_type = RSS;
                $this->feed_version = '1.0';
            }
            elseif ( $el == 'rss' ) {
                $this->feed_type = RSS;
                $this->feed_version = $version;
            }
            elseif ( $el == 'feed' ) {
                $this->feed_type = ATOM;
                $this->feed_version = $version;
                $this->inchannel = true;
            }
            return;
        }

        if ( $el == 'channel' )
        {
            $this->inchannel = true;
        }
        elseif ($el == 'item' or $el == 'entry' )
        {
            $this->initem = true;
            if ( isset($attrs['rdf:about']) ) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        }

        // if we're in the default namespace of an RSS feed,
        //  record textinput or image fields
        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'textinput' )
        {
            $this->intextinput = true;
        }

        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'image' )
        {
            $this->inimage = true;
        }

        // handle atom content constructs
        elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            // avoid clashing w/ RSS mod_content
            if ($el == 'content' ) {
                $el = 'atom_content';
            }

            $this->incontent = $el;
        }

        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
        elseif ($this->feed_type == ATOM and $this->incontent )
        {
            // if tags are inlined, then flatten
            $attrs_str = join(' ',
                    array_map('map_attrs',
                    array_keys($attrs),
                    array_values($attrs) ) );

            $this->append_content( "<$element $attrs_str>"  );

            array_unshift( $this->stack, $el );
        }

        // Atom supports many links per containing element.
        // Magpie treats link elements of type rel='alternate'
        // as being equivalent to RSS's simple link element.
        elseif ($this->feed_type == ATOM and $el == 'link' )
        {
            // rel and href may both be missing; fall back to empty strings
            // (a missing rel yields the key "link_", as before)
            $rel  = isset($attrs['rel'])  ? $attrs['rel']  : '';
            $href = isset($attrs['href']) ? $attrs['href'] : '';

            $link_el = ($rel == 'alternate') ? 'link' : 'link_' . $rel;

            $this->append($link_el, $href);
        }
        // set stack[0] to current element
        else {
            array_unshift($this->stack, $el);
        }
    }

    /**
     * XML character-data handler: routes text either into the current Atom
     * content construct or into the field named by the element stack.
     */
    public function feed_cdata ($p, $text) {
        if ($this->feed_type == ATOM and $this->incontent)
        {
            $this->append_content( $text );
        }
        else {
            $current_el = join('_', array_reverse($this->stack));
            $this->append($current_el, $text);
        }
    }

    /**
     * XML end-element handler: closes the section opened by
     * feed_start_element() and resets the current namespace.
     */
    public function feed_end_element ($p, $el) {
        $el = strtolower($el);

        if ( $el == 'item' or $el == 'entry' )
        {
            $this->items[] = $this->current_item;
            $this->current_item = array();
            $this->initem = false;
        }
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' )
        {
            $this->intextinput = false;
        }
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' )
        {
            $this->inimage = false;
        }
        elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            $this->incontent = false;
        }
        elseif ($el == 'channel' or $el == 'feed' )
        {
            $this->inchannel = false;
        }
        elseif ($this->feed_type == ATOM and $this->incontent  ) {
            // balance tags properly
            // note:  i don't think this is actually necessary
            if ( isset($this->stack[0]) and $this->stack[0] == $el )
            {
                $this->append_content("</$el>");
            }
            else {
                $this->append_content("<$el />");
            }

            array_shift( $this->stack );
        }
        else {
            array_shift( $this->stack );
        }

        $this->current_namespace = false;
    }

    /**
     * Append $str2 to $str1 in place, initialising $str1 to "" when unset.
     */
    public function concat (&$str1, $str2="") {
        if (!isset($str1) ) {
            $str1="";
        }
        $str1 .= $str2;
    }

    /**
     * Append text to the Atom content construct currently being parsed,
     * on the current item if inside one, otherwise on the channel.
     */
    public function append_content($text) {
        if ( $this->initem ) {
            $this->concat( $this->current_item[ $this->incontent ], $text );
        }
        elseif ( $this->inchannel ) {
            $this->concat( $this->channel[ $this->incontent ], $text );
        }
    }

    /**
     * Smart append - field and namespace aware.
     *
     * Adds $text to field $el of whichever section is being parsed. The
     * section precedence intentionally differs between the namespaced and
     * the default-namespace case.
     */
    public function append($el, $text) {
        if (!$el) {
            return;
        }
        if ( $this->current_namespace )
        {
            $ns = $this->current_namespace;
            if ( $this->initem ) {
                $this->concat( $this->current_item[ $ns ][ $el ], $text );
            }
            elseif ($this->inchannel) {
                $this->concat( $this->channel[ $ns ][ $el ], $text );
            }
            elseif ($this->intextinput) {
                $this->concat( $this->textinput[ $ns ][ $el ], $text );
            }
            elseif ($this->inimage) {
                $this->concat( $this->image[ $ns ][ $el ], $text );
            }
        }
        else {
            if ( $this->initem ) {
                $this->concat( $this->current_item[ $el ], $text );
            }
            elseif ($this->intextinput) {
                $this->concat( $this->textinput[ $el ], $text );
            }
            elseif ($this->inimage) {
                $this->concat( $this->image[ $el ], $text );
            }
            elseif ($this->inchannel) {
                $this->concat( $this->channel[ $el ], $text );
            }
        }
    }

    /**
     * Cross-populate RSS and Atom field names so callers can read either
     * vocabulary, and compute each item's 'date_timestamp'.
     *
     * Branches on the feed type rather than the feed version, because the
     * version attribute is optional and its absence must not skip this step.
     */
    public function normalize () {
        if ( $this->feed_type == ATOM ) {
            // if atom populate rss fields
            if ( isset($this->channel['tagline']) ) {
                $this->channel['description'] = $this->channel['tagline'];
            }
            elseif ( isset($this->channel['subtitle']) ) {
                // Atom 1.0 name for what Atom 0.3 called tagline
                $this->channel['description'] = $this->channel['subtitle'];
            }
            else {
                $this->channel['description'] = null;
            }
            foreach ( array_keys($this->items) as $i ) {
                $this->items[$i] = $this->_normalize_atom_item($this->items[$i]);
            }
        }
        elseif ( $this->feed_type == RSS ) {
            $this->channel['tagline'] =
                isset($this->channel['description']) ? $this->channel['description'] : null;
            foreach ( array_keys($this->items) as $i ) {
                $this->items[$i] = $this->_normalize_rss_item($this->items[$i]);
            }
        }
    }

    /**
     * Add RSS-style fields (description, content:encoded, date_timestamp)
     * to a parsed Atom entry and return it.
     */
    private function _normalize_atom_item($item) {
        if ( isset($item['summary']) ) {
            $item['description'] = $item['summary'];
        }
        if ( isset($item['atom_content']) ) {
            $item['content']['encoded'] = $item['atom_content'];
        }

        // Atom 0.3 date fields first (original behaviour), then Atom 1.0 names
        $atom_date = null;
        foreach ( array('issued', 'modified', 'published', 'updated') as $field ) {
            if ( isset($item[$field]) ) {
                $atom_date = $item[$field];
                break;
            }
        }
        if ( $atom_date ) {
            $epoch = $this->_w3cdtf_to_epoch($atom_date);
            if ( $epoch ) {
                $item['date_timestamp'] = $epoch;
            }
        }

        return $item;
    }

    /**
     * Add Atom-style fields (summary, atom_content, date_timestamp)
     * to a parsed RSS item and return it.
     */
    private function _normalize_rss_item($item) {
        if ( isset($item['description']) ) {
            $item['summary'] = $item['description'];
        }
        if ( isset($item['content']['encoded']) ) {
            $item['atom_content'] = $item['content']['encoded'];
        }

        if ( $this->feed_version == '1.0' and isset($item['dc']['date']) ) {
            $epoch = $this->_w3cdtf_to_epoch($item['dc']['date']);
            if ( $epoch ) {
                $item['date_timestamp'] = $epoch;
            }
        }
        elseif ( isset($item['pubdate']) ) {
            $epoch = @strtotime($item['pubdate']);
            if ( $epoch > 0 ) {
                $item['date_timestamp'] = $epoch;
            }
        }

        return $item;
    }

    /**
     * Convert a W3CDTF date string to a positive Unix timestamp using
     * parse_w3cdtf() (expected from rss_utils.inc); returns false when the
     * helper is unavailable or the date cannot be parsed.
     */
    private function _w3cdtf_to_epoch($date) {
        if ( !function_exists('parse_w3cdtf') ) {
            return false;
        }
        $epoch = @parse_w3cdtf($date);
        return ($epoch and $epoch > 0) ? $epoch : false;
    }

    /**
     * Return the feed version if this is an RSS feed, false otherwise.
     */
    public function is_rss () {
        if ( $this->feed_type == RSS ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * Return the feed version if this is an Atom feed, false otherwise.
     */
    public function is_atom() {
        if ( $this->feed_type == ATOM ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * Return XML parser, and possibly re-encoded source, as array($parser, $source).
     */
    public function create_parser($source, $out_enc, $in_enc, $detect) {
        // Compare the full version: testing only for a leading "5" would send
        // every later PHP release down the PHP4 code path.
        if ( version_compare(PHP_VERSION, '5.0.0', '>=') ) {
            $parser = $this->php5_create_parser($in_enc, $detect);
        }
        else {
            list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
        }
        if ($out_enc) {
            $this->encoding = $out_enc;
            // only touch the parser if it was actually created
            if ($this->_is_parser($parser)) {
                xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
            }
        }

        return array($parser, $source);
    }

    /**
     * Instantiate an XML parser under PHP5 and later.
     *
     * PHP5 will do a fine job of detecting input encoding
     * if passed an empty string as the encoding.
     */
    public function php5_create_parser($in_enc, $detect) {
        if (!$detect && $in_enc) {
            return xml_parser_create($in_enc);
        }
        else {
            return xml_parser_create('');
        }
    }

    /**
     * Instantiate an XML parser under PHP4.
     *
     * Unfortunately PHP4's support for character encodings
     * and especially XML and character encodings sucks.  As
     * long as the documents you parse only contain characters
     * from the ISO-8859-1 character set you're fine.  However once you
     * step out of that comfy little world things get mad, bad,
     * and dangerous to know.
     *
     * The following code is based on SJM's work with FoF
     * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
     */
    public function php4_create_parser($source, $in_enc, $detect) {
        if ( !$detect ) {
            return array(xml_parser_create($in_enc), $source);
        }

        if (!$in_enc) {
            // read the encoding from the XML prolog; "\?" matches the literal "?"
            if (preg_match('/<\?xml[^>]*?encoding=[\'"]([^\'"]+)[\'"][^>]*\?>/', $source, $m)) {
                $in_enc = strtoupper($m[1]);
                $this->_source_encoding = $in_enc;
                $this->source_encoding  = $in_enc;
            }
            else {
                $in_enc = 'UTF-8';
            }
        }

        if ($this->known_encoding($in_enc)) {
            return array(xml_parser_create($in_enc), $source);
        }

        // the detected encoding is not one of the simple encodings PHP knows

        // attempt to use the iconv extension to
        // cast the XML to a known encoding
        // @see http://php.net/iconv
        if (function_exists('iconv'))  {
            $encoded_source = iconv($in_enc,'UTF-8', $source);
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // iconv didn't work, try mb_convert_encoding
        // @see http://php.net/mbstring
        if (function_exists('mb_convert_encoding')) {
            $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc );
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // else
        $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
                     "You may see strange artifacts, and mangled characters.",
                     E_USER_NOTICE);

        return array(xml_parser_create(), $source);
    }

    /**
     * Return the upper-cased encoding name if it is one the XML parser
     * handles natively, false otherwise.
     */
    public function known_encoding($enc) {
        $enc = strtoupper($enc);
        if ( in_array($enc, $this->_KNOWN_ENCODINGS) ) {
            return $enc;
        }
        else {
            return false;
        }
    }

    /**
     * Report an error: trigger it when MAGPIE_DEBUG is defined and true,
     * otherwise write it to the error log, and remember the message in
     * $this->WARNING (notice levels) or $this->ERROR (everything else).
     */
    public function error ($errormsg, $lvl=E_USER_WARNING) {
        // MAGPIE_DEBUG is defined outside this file; treat "undefined" as off
        if ( defined('MAGPIE_DEBUG') && MAGPIE_DEBUG ) {
            trigger_error( $errormsg, $lvl);
        }
        else {
            error_log( $errormsg, 0);
        }

        $notices = E_USER_NOTICE|E_NOTICE;
        if ( $lvl&$notices ) {
            $this->WARNING = $errormsg;
        } else {
            $this->ERROR = $errormsg;
        }
    }

} // end class RSS
583
584
/**
 * Render one attribute as name="value" for re-serialising inline markup.
 *
 * The XML parser hands over attribute values with entities already decoded,
 * so markup-significant characters are escaped again here; otherwise a
 * double quote or ampersand in the value would break the generated tag.
 * str_replace() is used because it is byte-safe regardless of the output
 * encoding the feed was parsed into.
 *
 * @param string $k attribute name
 * @param string $v attribute value (entity-decoded)
 * @return string
 */
function map_attrs($k, $v) {
    $v = str_replace(
        array('&', '"', '<', '>'),
        array('&amp;', '&quot;', '&lt;', '&gt;'),
        (string) $v
    );
    return "$k=\"$v\"";
}
587
588
// patch to support medieval versions of PHP4.1.x, 
589
// courtesy, Ryan Currie, [email protected]
590
591
if (!function_exists('array_change_key_case')) {
    define("CASE_UPPER", 1);
    define("CASE_LOWER", 0);

    /**
     * Fallback for PHP builds that lack array_change_key_case().
     *
     * Returns a copy of $array with every key lower-cased (default) or
     * upper-cased (when $case is CASE_UPPER). An empty input yields an
     * empty array.
     *
     * @param array $array input array
     * @param int   $case  CASE_LOWER or CASE_UPPER
     * @return array
     */
    function array_change_key_case($array, $case = CASE_LOWER) {
        // compare (==) rather than assign (=), and name the converter as a
        // string so it is a callable instead of an undefined constant
        $cmd = ($case == CASE_UPPER) ? 'strtoupper' : 'strtolower';

        $output = array();
        foreach ($array as $key => $value) {
            $output[$cmd($key)] = $value;
        }
        return $output;
    }
}
606