Completed
Push — master ( 7668ba...b7b643 )
by Andreas
08:40
created

net_nemein_rss_fetch::get_parser()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 19
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nc 2
nop 0
dl 0
loc 19
rs 9.4285
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
/**
10
 * RSS and Atom feed fetching class. Caches the fetched items as articles
11
 * in net.nehmer.blog or events in net.nemein.calendar
12
 *
13
 * @package net.nemein.rss
14
 */
15
class net_nemein_rss_fetch extends midcom_baseclasses_components_purecode
1 ignored issue
show
Coding Style Compatibility introduced by
PSR1 recommends that each class must be in a namespace of at least one level to avoid collisions.

You can fix this by adding a namespace to your class:

namespace YourVendor;

class YourClass { }

When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.

Loading history...
16
{
17
    /**
18
     * The last error reported by SimplePie, if any
     *
     * Written by fetch() with the parser's error() result when fetching fails.
19
     */
20
    public $lasterror;
21
22
    /**
23
     * The feed object we're fetching
     *
     * Assigned from the constructor argument.
     *
     * @var net_nemein_rss_feed_dba
24
     */
25
    private $_feed;
26
27
    /**
28
     * Property of midcom_db_article we're using for storing the feed item GUIDs
     *
     * @var string
29
     */
30
    private $_guid_property = 'extra2';
31
32
    /**
33
     * Current node we're importing to
34
     *
     * Built in the constructor from the feed's node property.
     *
35
     * @var midcom_db_topic
36
     */
37
    private $_node = null;
38
39
    /**
     * Sets up the fetcher for one feed: remembers the feed object and loads
     * the topic the feed's node property points to.
     *
     * @param net_nemein_rss_feed_dba $feed The feed to fetch and import
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_feed = $feed;
        $target_node = new midcom_db_topic($this->_feed->node);
        $this->_node = $target_node;

        parent::__construct();
    }
50
51
    /**
     * Builds a SimplePie instance configured for this component
     *
     * The parser uses net_nemein_rss_parser_item as item class, outputs in the
     * current MidCOM charset, has its cache location pointed at the MidCOM
     * temp directory and runs with caching switched off.
     *
     * @return SimplePie
     */
    public static function get_parser()
    {
        /**
         * Parser instances are kept around until shutdown on PHP >= 5.4,
         * if they are deleted before, this triggers a segfault
         * @see https://github.com/simplepie/simplepie/issues/284
         */
        static $parsers = array();

        $parser = new SimplePie;

        $registry = $parser->get_registry();
        $registry->register('Item', 'net_nemein_rss_parser_item');

        $charset = midcom::get()->i18n->get_current_charset();
        $parser->set_output_encoding($charset);

        $tempdir = midcom::get()->config->get('midcom_tempdir');
        $parser->set_cache_location($tempdir);

        // Cache stays disabled, enabling it leads to segfaults for some reason
        $parser->enable_cache(false);

        if (version_compare(PHP_VERSION, '5.4', '>='))
        {
            $parsers[] = $parser;
        }

        return $parser;
    }
73
74
    /**
     * Fetches a feed URL with a freshly configured parser
     *
     * The returned parser has already been initialized, callers should check
     * its error state before using the items.
     *
     * @param string $url The URL to fetch
     * @return SimplePie
     */
    public static function raw_fetch($url)
    {
        $feed_parser = self::get_parser();

        $feed_parser->set_feed_url($url);
        $feed_parser->init();

        return $feed_parser;
    }
87
88
    /**
     * Fetch given RSS or Atom feed
     *
     * On a parser error the message is stored in $lasterror and an empty array
     * is returned. If the response carries an ETag header equal to the one stored
     * on the feed, the feed is considered unchanged and an empty array is returned
     * as well. Otherwise the ETag (if any) and the fetch time are saved to the feed
     * before the parsed items are returned.
     *
     * @return array Items as returned by the parser's get_items(), empty array on error or unchanged feed
     */
    public function fetch()
    {
        $parser = self::raw_fetch($this->_feed->url);

        $error = $parser->error();
        if ($error)
        {
            $this->lasterror = $error;
            return array();
        }

        if (!empty($parser->data['headers']['etag']))
        {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);

            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');
            // Strict comparison: ETags are opaque strings, loose comparison would
            // treat numeric-looking values such as "0123" and "123" as equal
            if (   !empty($feed_etag)
                && $feed_etag === $etag)
            {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return array();
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time without generating activity stream or RCS entries
        $this->_feed->latestfetch = time();
        $this->_feed->_use_activitystream = false;
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
125
126
    /**
     * Fetches the feed and imports every item it contains
     *
     * Nothing is done when the target node has no component set or when the
     * fetch yields no items. After importing, clean() is run on the items.
     *
     * @return array The processed items, in the order the feed delivered them
     */
    public function import()
    {
        if (!$this->_node->component)
        {
            return array();
        }

        $fetched = $this->fetch();
        if (count($fetched) == 0)
        {
            // This feed didn't return any items, skip
            return array();
        }

        // Process in reverse so that creation times remain in correct order even for feeds without timestamps
        $items = array_reverse($fetched);

        foreach ($items as $item)
        {
            $guid = $this->import_item($item);
            if (!$guid)
            {
                debug_add("Failed to import item " . $item->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }

            $item->set_local_guid($guid);
            debug_add("Imported item " . $item->get_id() . ' as ' . $guid, MIDCOM_LOG_INFO);
        }

        $this->clean($items);

        return array_reverse($items);
    }
164
165
    /**
     * Imports a feed item into the database
     *
     * Dispatches on the component of the node we're importing to. Only
     * net.nehmer.blog is currently supported.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return mixed Result of import_article() for blog nodes, false for unsupported components
     * @throws midcom_error If the node's component is net.nemein.calendar
     */
    public function import_item(net_nemein_rss_parser_item $item)
    {
        switch ($this->_node->component)
        {
            case 'net.nehmer.blog':
                return $this->import_article($item);

            case 'net.nemein.calendar':
                throw new midcom_error('Event importing has to be re-implemented with SimplePie API');

            default:
                // Deliberately only logged instead of thrown: an exception here would
                // totally break cron if someone changed the component of a folder
                // that had subscriptions
                debug_add("RSS fetching for component {$this->_node->component} is unsupported", MIDCOM_LOG_ERROR);
                return false;
        }
    }
192
193
    /**
     * Imports an item as a news article
     *
     * The article is looked up (or prepared) by find_article(), filled with the
     * item's title, GUID, content, link, feed/category markers, author and
     * publication date and then updated or created. Afterwards it is optionally
     * approved, tagged, and enclosure and replies-link parameters are stored.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return mixed The article's guid on success, false if the item was skipped or could not be saved
     */
    private function import_article(net_nemein_rss_parser_item $item)
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        if (   (   empty($title)
                || trim($title) == '...')
            && empty($guid))
        {
            // Something wrong with this entry, skip it
            return false;
        }

        $article = $this->find_article($item, $guid);
        if (!$article)
        {
            return false;
        }

        $article->allow_name_catenate = true;
        $article->_activitystream_verb = 'http://community-equity.org/schema/1.0/clone';
        $article->_rcs_message = sprintf(midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss'), $title, $this->_feed->title);

        $values = array
        (
            'title' => $title,
            $this->_guid_property => $guid, // FIXME: This breaks with URLs longer than 255 chars
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        );
        $meta_values = array();

        // Safety, make sure we have sane name (the allow_catenate was set earlier, so this will not clash
        if (empty($article->name))
        {
            $generator = midcom::get()->serviceloader->load('midcom_core_service_urlgenerator');
            $values['name'] = $generator->from_string($title);
        }

        $categories = $item->get_categories();
        if (is_array($categories))
        {
            // Handle categories provided in the feed
            foreach ($categories as $category)
            {
                // "|" is the separator inside extra1, so it must not appear in a category
                $category = str_replace('|', '_', trim($category->get_term()));
                $values['extra1'] .= "{$category}|";
            }
        }

        $article_author = $this->find_author($item);
        if (!empty($article_author->guid))
        {
            $meta_values['authors'] = "|{$article_author->guid}|";
        }

        // Try to figure out item publication date
        $article_date = $item->get_date('U');

        $article_data_tweaked = false;
        if (!$article_date)
        {
            // No date in the item, fall back to the time of import
            $article_date = time();
            $article_data_tweaked = true;
        }

        if ($article_date > $this->_feed->latestupdate)
        {
            // Cache "latest updated" time to feed
            $this->_feed->latestupdate = $article_date;
            $this->_feed->_use_activitystream = false;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }

        if ($article->id)
        {
            if (!$article_data_tweaked)
            {
                // Only overwrite the published time when the item really provided one
                $meta_values['published'] = $article_date;
            }

            // Existing article: only hit the database if something actually changed
            if (   $this->apply_values($article, $values, $meta_values)
                && !$article->update())
            {
                return false;
            }
        }
        else
        {
            $this->apply_values($article, $values, $meta_values);
            if (!$article->create())
            {
                return false;
            }
        }

        if ($this->_feed->autoapprove)
        {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        // The link is looked up once and checked explicitly instead of relying on loose truthiness
        $replies_url = $item->get_link(0, 'replies');
        if (   $replies_url !== null
            && $replies_url !== '')
        {
            $article->set_parameter('net.nemein.rss', 'replies_url', $replies_url);
        }

        return $article->guid;
    }
311
312
    /**
     * Determines the person to use as author of an imported item
     *
     * If the feed forces its default author, that person is used. Otherwise
     * the item's author is matched against the database; when nothing (or only
     * the fallback person) matches, the feed's default author is used if set,
     * else the fallback person with ID 1.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return midcom_db_person
     */
    private function find_author(net_nemein_rss_parser_item $item)
    {
        if (   $this->_feed->forceauthor
            && $this->_feed->defaultauthor)
        {
            // The feed's "default author" overrides whatever the item says
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        $fallback_person_id = 1;

        $matched = $this->match_item_author($item);
        if (   $matched
            && $matched->id != $fallback_person_id)
        {
            return $matched;
        }

        if ($this->_feed->defaultauthor)
        {
            // Feed has a "default author" set, use it
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        // Fall back to "Midgard Admin" just in case
        return new midcom_db_person($fallback_person_id);
    }
339
340
    /**
     * Finds the article belonging to a feed item, or prepares a new one
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @param string $guid The item's ID, matched against the GUID property (first 255 characters)
     * @return mixed The existing article, a new unsaved article, or false if the
     *               item's link is already present in the feed's topic
     */
    private function find_article(net_nemein_rss_parser_item $item, $guid)
    {
        $topic_id = $this->_feed->node;

        $existing_qb = midcom_db_article::new_query_builder();
        $existing_qb->add_constraint('topic', '=', $topic_id);
        $existing_qb->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $existing_qb->execute();
        if (count($existing) > 0)
        {
            // Imported earlier already, the caller will update it
            return $existing[0];
        }

        // Check against duplicate hits that may come from different feeds
        $link = $item->get_link();
        if ($link)
        {
            $dupe_qb = midcom_db_article::new_query_builder();
            $dupe_qb->add_constraint('topic', '=', $topic_id);
            $dupe_qb->add_constraint('url', '=', $link);
            if ($dupe_qb->count() > 0)
            {
                // Dupe, skip
                return false;
            }
        }

        // This is a new item
        $article = new midcom_db_article();
        $article->topic = $topic_id;

        $node = new midcom_db_topic($topic_id);

        $symlink = $node->get_parameter('net.nehmer.blog', 'symlink_topic');
        if ($symlink)
        {
            try
            {
                // Store the article in the symlinked topic instead
                $symlink_topic = new midcom_db_topic($symlink);
                $article->topic = $symlink_topic->id;
            }
            catch (midcom_error $e)
            {
                $e->log();
            }
        }

        $node_lang_code = $node->get_parameter('net.nehmer.blog', 'language');
        if ($node_lang_code)
        {
            $article->lang = midcom::get()->i18n->code_to_id($node_lang_code);
        }

        return $article;
    }
389
390
    /**
     * Copies field and metadata values onto an article
     *
     * Only values that are not identical (!==) to the current ones are written.
     *
     * @param midcom_db_article $article The article to modify
     * @param array $values Field name => value pairs for the article itself
     * @param array $meta_values Field name => value pairs for the article's metadata
     * @return boolean True if at least one value was written
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values)
    {
        $changed = false;

        foreach ($values as $field => $new_value)
        {
            if ($article->$field === $new_value)
            {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value)
        {
            if ($article->metadata->$field === $new_value)
            {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
414
415
    /**
     * Cleans up old, removed items from feeds
     *
     * Deletes and purges the articles that carry this feed's marker in extra1 but
     * whose stored item GUID is not among the given items. Articles that have been
     * favourited (when net.nemein.favourites is available) are kept. Nothing is
     * done if the feed is configured to keep removed items.
     *
     * @param array $items Feed items as provided by SimplePie
     * @return mixed False if the feed keeps removed items, nothing otherwise
     */
    public function clean($items)
    {
        if ($this->_feed->keepremoved)
        {
            // This feed is set up so that we retain items removed from array
            return false;
        }

        // Create array of item GUIDs
        $item_guids = array();
        foreach ($items as $item)
        {
            $item_guids[] = $item->get_id();
        }

        // Find articles resulting from this feed
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();
        if (!is_array($local_items))
        {
            // The query can fail and return a non-array, make sure we never iterate over that
            debug_add("Failed to query removed items of feed {$this->_feed->url}: " . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
            $local_items = array();
        }

        // Availability of the favourites component does not change between items,
        // so it is determined once, and only if there is something to delete
        $check_favourites = null;

        $purge_guids = array();
        foreach ($local_items as $item)
        {
            if ($check_favourites === null)
            {
                $check_favourites = (   midcom::get()->componentloader->is_installed('net.nemein.favourites')
                                     && midcom::get()->componentloader->load_graceful('net.nemein.favourites'));
            }

            if ($check_favourites)
            {
                // If it has been favorited keep it
                $favourites_qb = net_nemein_favourites_favourite_dba::new_query_builder();
                $favourites_qb->add_constraint('objectGuid', '=', $item->guid);
                if ($favourites_qb->count_unchecked() > 0)
                {
                    // Skip deleting this one
                    continue;
                }
            }

            $purge_guids[] = $item->guid;
            $item->delete();
        }

        midcom_baseclasses_core_dbobject::purge($purge_guids, 'midgard_article');
    }
463
464
    /**
     * Parses author formats used by different feed standards
     * and returns the information
     *
     * The author's name is used if present, otherwise the author's email value.
     * Besides plain names, the formats "Full Name <email>" and "email (Full Name)"
     * are recognized. A resulting name containing a space is reported as
     * full_name, anything else as username.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return array Information found, with the optional keys email, full_name and username
     */
    public function parse_item_author(net_nemein_rss_parser_item $item)
    {
        $author_info = array();

        $author = $item->get_author();
        if (empty($author))
        {
            return $author_info;
        }

        $charset = midcom::get()->i18n->get_current_charset();

        // First try dig up any information about the author possible
        $name = $author->get_name();
        $has_name = !empty($name);
        if (!$has_name)
        {
            // No name, work with the email value instead (cast, since it may be missing too)
            $name = (string) $author->get_email();
        }
        $name = html_entity_decode($name, ENT_QUOTES, $charset);

        if ($name === '')
        {
            // Neither name nor email, don't report an empty username
            return $author_info;
        }

        if ($has_name)
        {
            // Atom feed, the value can be either full name or username.
            // Also serves as fallback if the combined formats below don't match
            $author_info['user_or_full'] = $name;
        }

        if (!preg_match('/(<|\()/', $name))
        {
            $author_info['user_or_full'] = $name;
        }
        else
        {
            if (strpos($name, '<') !== false)
            {
                // The classic "Full Name <email>" format
                $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
            }
            else
            {
                // The classic "email (Full Name)" format
                $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
            }
            if (preg_match($regex, $name, $matches))
            {
                $author_info['email'] = $matches['email'];
                $author_info['user_or_full'] = $matches['fullname'];
            }
        }

        if (isset($author_info['user_or_full']))
        {
            if (strpos($author_info['user_or_full'], ' ') !== false)
            {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            }
            else
            {
                $author_info['username'] = $author_info['user_or_full'];
            }
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
533
534
    /**
     * Parses author formats used by different feed standards and
     * tries to match to persons in database.
     *
     * Matching is attempted by email first, then by username, then by
     * first and last name.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return midcom_db_person|null Person object matched, or null
     */
    public function match_item_author(net_nemein_rss_parser_item $item)
    {
        // Parse the item for author information
        $author_info = $this->parse_item_author($item);

        if (!empty($author_info['email']))
        {
            // Email is a pretty good identifier, start with it
            $person_qb = midcom_db_person::new_query_builder();
            $person_qb->add_constraint('email', '=', $author_info['email']);
            $persons = $person_qb->execute();
            if (count($persons) > 0)
            {
                return $persons[0];
            }
        }

        if (!empty($author_info['username']))
        {
            $user = midcom::get()->auth->get_user_by_name($author_info['username']);
            if ($user)
            {
                return $user->get_storage();
            }
        }

        if (!empty($author_info['full_name']))
        {
            $name_parts = explode(' ', $author_info['full_name']);
            if (count($name_parts) > 1)
            {
                // We assume the western format Firstname Lastname,
                // only the first two parts of the name are looked at
                $firstname = $name_parts[0];
                $lastname = $name_parts[1];

                $person_qb = midcom_db_person::new_query_builder();
                $person_qb->add_constraint('firstname', '=', $firstname);
                $person_qb->add_constraint('lastname', '=', $lastname);
                $persons = $person_qb->execute();
                if (count($persons) > 0)
                {
                    return $persons[0];
                }
            }
        }

        return null;
    }
588
589
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * Stores link, duration and type of the item's enclosures in the
     * net.nemein.rss:enclosure parameter domain. The same parameter names are
     * written for every enclosure, so with several enclosures the values of the
     * last one remain.
     *
     * @param midcom_core_dbaobject $article Imported article
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     */
    private function _parse_parameters(midcom_core_dbaobject $article, net_nemein_rss_parser_item $item)
    {
        $enclosures = $item->get_enclosures();
        if (!is_array($enclosures))
        {
            // Items without enclosures yield null, nothing to store then
            return;
        }

        foreach ($enclosures as $enclosure)
        {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
604
605
    /**
     * Tags the article based on the rel-tag links found in one of its fields
     *
     * Each anchor with a value becomes a tag (lowercased, markup stripped)
     * pointing to the anchor's href. Nothing happens if no anchors are found.
     *
     * @param midgard_article $article Imported article
     * @param string $field Name of the article property to scan
     * @return mixed Result of net_nemein_tag_handler::tag_object(), or nothing if no anchors were found
     */
    private function _parse_tags($article, $field = 'content')
    {
        $anchors = org_openpsa_httplib_helpers::get_anchor_values($article->$field, 'tag');
        if (count($anchors) < 1)
        {
            return;
        }

        $tags = array();
        foreach ($anchors as $anchor)
        {
            // Anchors without an actual tag specified are ignored
            if ($anchor['value'])
            {
                $tag_name = strtolower(strip_tags($anchor['value']));
                $tags[$tag_name] = $anchor['href'];
            }
        }

        return net_nemein_tag_handler::tag_object($article, $tags);
    }
632
}
633