Completed
Push — master ( 213f30...2ef5cc )
by Andreas
23:06
created

net_nemein_rss_fetch::get_parser()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 5
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 7
ccs 6
cts 6
cp 1
crap 1
rs 10
1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
11
/**
12
 * RSS and Atom feed fetching class. Caches the fetched items as articles
13
 * in net.nehmer.blog
14
 *
15
 * @package net.nemein.rss
16
 */
17
class net_nemein_rss_fetch extends midcom_baseclasses_components_purecode
18
{
19
    /**
20
     * The last error reported by SimplePie, if any
21
     */
22
    public $lasterror;
23
24
    /**
25
     * The feed object we're fetching
26
     */
27
    private $_feed;
28
29
    /**
30
     * Property of midcom_db_article we're using for storing the feed item GUIDs
31
     */
32
    private $_guid_property = 'extra2';
33
34
    /**
35
     * Current node we're importing to
36
     *
37
     * @var midcom_db_topic
38
     */
39
    private $_node;
40
41
    /**
     * Binds the fetcher to a feed and loads the topic the feed imports into.
     *
     * @param net_nemein_rss_feed_dba $feed Feed whose items will be fetched and imported
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_feed = $feed;
        // The feed's "node" field is handed to midcom_db_topic to load the import target
        $this->_node = new midcom_db_topic($feed->node);

        parent::__construct();
    }
52
53 2
    /**
     * Builds a SimplePie instance preconfigured for this component.
     *
     * Feed items are registered to be instantiated as net_nemein_rss_parser_item,
     * the output encoding is set to the current MidCOM charset and the configured
     * 'midcom_tempdir' is used as cache location.
     */
    public static function get_parser() : SimplePie
    {
        $charset = midcom::get()->i18n->get_current_charset();
        $cache_dir = midcom::get()->config->get('midcom_tempdir');

        $simplepie = new SimplePie;
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);
        $simplepie->set_output_encoding($charset);
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
61
62
    /**
     * Downloads and parses the feed at the given URL.
     *
     * The result of init() is not inspected here; check error() on the
     * returned parser to find out whether fetching succeeded.
     *
     * @param string $url Address of the RSS or Atom feed
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $feed_parser = self::get_parser();
        $feed_parser->set_feed_url($url);
        $feed_parser->init();

        return $feed_parser;
    }
72
73
    /**
     * Fetch given RSS or Atom feed
     *
     * On a parser error the message is stored in $lasterror and an empty array is
     * returned. If the response carries an ETag identical to the one remembered for
     * the feed, the feed is treated as unchanged and an empty array is returned as
     * well. Otherwise the new ETag is remembered and the feed's latestfetch
     * timestamp is saved (with _use_rcs switched off for that update).
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    public function fetch() : array
    {
        $parser = self::raw_fetch($this->_feed->url);
        $error = $parser->error();
        if ($error) {
            $this->lasterror = $error;
            return [];
        }

        if (!empty($parser->data['headers']['etag'])) {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);

            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');
            // ETags are opaque strings: compare them strictly, a loose comparison would
            // treat numeric-looking values such as "1e3" and "1000" as identical
            if (   !empty($feed_etag)
                && (string) $feed_etag === $etag) {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        $this->_feed->latestfetch = time();
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
106
107
    /**
     * Fetches the feed and imports each of its items into the target node.
     *
     * @return net_nemein_rss_parser_item[] The fetched items; empty if the node has no
     *         component set or the fetch yielded nothing
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // Nothing came back for this feed, so there is nothing to import
            return [];
        }

        // Work back to front so that creation times stay in the right order
        // even for feeds that carry no timestamps
        $queue = array_reverse($fetched);

        foreach ($queue as $entry) {
            $local_guid = $this->import_item($entry);
            if (!$local_guid) {
                debug_add("Failed to import item " . $entry->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }
            $entry->set_local_guid($local_guid);
            debug_add("Imported item " . $entry->get_id() . ' as ' . $local_guid, MIDCOM_LOG_INFO);
        }

        $this->clean($queue);

        // Hand the items back in the order the feed delivered them
        return array_reverse($queue);
    }
139
140
    /**
     * Imports a single feed item into the current node.
     *
     * Only nodes running net.nehmer.blog are supported.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @throws midcom_error If the node's component is anything other than net.nehmer.blog
     * @return string|null GUID of the resulting article, null if the item was not stored
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
152
153
    /**
     * Imports an item as a news article
     *
     * An article already known for the item is updated, otherwise a new one is created.
     * Items lacking both a usable title and an id, and items find_article() rejects as
     * duplicates, are skipped. As a side effect the feed's latestupdate timestamp is
     * advanced when the item is newer.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return string|null GUID of the article, null if the item was skipped or could not be stored
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        if (   (   empty($title)
                || trim($title) == '...')
            && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $article->set_rcs_message(sprintf(midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss'), $title, $this->_feed->title));

        $values = [
            'title' => $title,
            // find_article() looks items up by the first 255 bytes of their id, so store
            // exactly that form; otherwise long ids never compare equal to the stored value
            $this->_guid_property => substr($guid, 0, 255),
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];
        $meta_values = [];

        // Safety, make sure we have sane name (allow_name_catenate was set earlier, so this will not clash)
        if (empty($article->name)) {
            $values['name'] = midcom_helper_misc::urlize($title);
        }

        $categories = $item->get_categories();
        if (is_array($categories)) {
            // Append the categories provided in the feed to the pipe-separated extra1 list
            foreach ($categories as $category) {
                // '|' is the list separator, so it must not occur inside a term
                $category = str_replace('|', '_', trim($category->get_term()));
                $values['extra1'] .= "{$category}|";
            }
        }

        $article_author = $this->find_author($item);
        if (!empty($article_author->guid)) {
            $meta_values['authors'] = "|{$article_author->guid}|";
        }

        // Try to figure out item publication date
        $article_date = $item->get_date('U');

        $article_data_tweaked = false;
        if (!$article_date) {
            // The item has no date of its own, use the import time instead
            $article_date = time();
            $article_data_tweaked = true;
        }

        if ($article_date > $this->_feed->latestupdate) {
            // Cache "latest updated" time to feed
            $this->_feed->latestupdate = $article_date;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }

        if ($article->id) {
            // Existing article: only set the published date when it really came from the feed
            if (!$article_data_tweaked) {
                $meta_values['published'] = $article_date;
            }

            if (   $this->apply_values($article, $values, $meta_values)
                && !$article->update()) {
                return null;
            }
        } else {
            $this->apply_values($article, $values, $meta_values);
            if (!$article->create()) {
                return null;
            }
        }

        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        $replies_url = $item->get_link(0, 'replies');
        if ($replies_url) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $replies_url);
        }

        return $article->guid;
    }
251
252 1
    /**
     * Determines the person to record as author of an imported item.
     *
     * Precedence: the feed's default author when the feed forces it, then a person
     * matched from the item's author data (unless that is person #1), then the
     * feed's default author, and finally person #1.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        $fallback_person_id = 1;

        if (   $this->_feed->forceauthor
            && $this->_feed->defaultauthor) {
            // The feed insists on its default author
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        $matched = $this->match_item_author($item);
        if (   $matched
            && $matched->id != $fallback_person_id) {
            return $matched;
        }

        // No usable match: prefer the feed's default author, otherwise
        // fall back to "Midgard Admin" just in case
        return new midcom_db_person($this->_feed->defaultauthor ?: $fallback_person_id);
    }
274
275 1
    /**
     * Locates the article a feed item should be written to.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @param string $guid Id of the feed item; only its first 255 bytes are used for the lookup
     * @return midcom_db_article|null The previously imported article, a new unsaved article
     *         for the feed's topic, or null if another article in the topic already has the item's URL
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $topic_id = $this->_feed->node;

        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $topic_id);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $by_guid->execute();
        if (!empty($existing)) {
            // Imported on an earlier run, so the caller will update it
            return $existing[0];
        }

        // The same entry may reach this topic through a different feed
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $topic_id);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Dupe, skip
                return null;
            }
        }

        // Not seen before
        $fresh = new midcom_db_article();
        $fresh->topic = $topic_id;
        return $fresh;
    }
302
303 1
    /**
     * Copies the given values onto the article, touching only fields that differ.
     *
     * Comparison is strict, so a value of a different type counts as a change.
     *
     * @param midcom_db_article $article Article to modify
     * @param array $values Field name => value pairs for the article itself
     * @param array $meta_values Field name => value pairs for the article's metadata
     * @return bool True if at least one field was assigned
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $field => $new_value) {
            if ($article->$field === $new_value) {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value) {
            if ($article->metadata->$field === $new_value) {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
323
324
    /**
     * Cleans up old, removed items from feeds
     *
     * Unless the feed is set to keep removed items, every article carrying this feed's
     * marker in extra1 whose stored item id is not among the given items is deleted
     * and then purged.
     *
     * @param array $items Feed items as provided by SimplePie
     * @return false|null false if the feed keeps removed items, nothing otherwise
     */
    public function clean($items)
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from array
            return false;
        }

        // Create array of item GUIDs
        $item_guids = [];
        foreach ($items as $item) {
            $id = $item->get_id();
            $item_guids[] = $id;

            // find_article() looks items up by the first 255 bytes of their id, so an id
            // may be stored in that shortened form. List it as well, otherwise
            // articles of items with long ids would be removed although the item still exists
            $short_id = substr((string) $id, 0, 255);
            if ($short_id !== $id) {
                $item_guids[] = $short_id;
            }
        }

        // Find articles resulting from this feed
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();

        $purge_guids = [];
        foreach ($local_items as $item) {
            $purge_guids[] = $item->guid;
            $item->delete();
        }

        midcom_baseclasses_core_dbobject::purge($purge_guids, 'midgard_article');
    }
356
357
    /**
     * Parses author formats used by different feed standards and
     * returns the information
     *
     * The author's name is used if present, otherwise the email field. Values of the
     * form "Full Name <email>" or "email (Full Name)" are split into their parts. The
     * remaining name is reported as 'full_name' if it contains a space and as
     * 'username' otherwise.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return array Any of the keys 'email', 'full_name' and 'username'; empty if the item has no author
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        $author = $item->get_author();

        // First try dig up any information about the author possible
        if (!empty($author)) {
            $charset = midcom::get()->i18n->get_current_charset();
            $name = $author->get_name();
            $email = $author->get_email();
            if (!empty($name)) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username
                $author_info['user_or_full'] = $name;
            } else {
                // The email may be missing as well; cast so that null is never passed
                // to html_entity_decode() (deprecated as of PHP 8.1)
                $name = html_entity_decode((string) $email, ENT_QUOTES, $charset);
            }

            if (!preg_match('/(<|\()/', $name)) {
                // Neither "<" nor "(" present, so there is nothing to split
                $author_info['user_or_full'] = $name;
            } else {
                if (strstr($name, '<')) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $author_info['user_or_full'] = trim($author_info['user_or_full']);
            if (strstr($author_info['user_or_full'], ' ')) {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            } else {
                $author_info['username'] = $author_info['user_or_full'];
            }
            // The intermediate key is not part of the result
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
409
410
    /**
     * Parses author formats used by different feed standards and
     * tries to match to persons in database.
     *
     * Lookups are attempted in this order: email address, username, and finally
     * first name plus last name taken from the first two words of the full name.
     *
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     * @return midcom_db_person|null Person object matched, or null
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        // Parse the item for author information
        $author_info = self::parse_item_author($item);

        if (!empty($author_info['email'])) {
            // Email is a pretty good identifier, start with it
            $person_qb = midcom_db_person::new_query_builder();
            $person_qb->add_constraint('email', '=', $author_info['email']);
            $persons = $person_qb->execute();
            if (!empty($persons)) {
                return $persons[0];
            }
        }

        if (   !empty($author_info['username'])
            && $person = midcom::get()->auth->get_user_by_name($author_info['username'])) {
            return $person->get_storage();
        }

        if (!empty($author_info['full_name'])) {
            // Split on runs of whitespace: a plain explode(' ') turns "John  Doe" into an
            // empty last name, which would then be used in the query below
            $name_parts = preg_split('/\s+/', $author_info['full_name'], -1, PREG_SPLIT_NO_EMPTY) ?: [];
            if (count($name_parts) > 1) {
                // We assume the western format Firstname Lastname
                $firstname = $name_parts[0];
                $lastname = $name_parts[1];

                $person_qb = midcom_db_person::new_query_builder();
                $person_qb->add_constraint('firstname', '=', $firstname);
                $person_qb->add_constraint('lastname', '=', $lastname);
                $persons = $person_qb->execute();
                if (!empty($persons)) {
                    return $persons[0];
                }
            }
        }

        return null;
    }
456
457
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * For each enclosure of the item, its link, duration and type are written to the
     * article's 'net.nemein.rss:enclosure' parameter domain.
     * NOTE(review): all enclosures write to the same three parameter names, so with
     * several enclosures presumably only the last one remains - confirm whether that is intended.
     *
     * @param midcom_db_article $article Article the parameters are stored on
     * @param net_nemein_rss_parser_item $item Feed item as provided by SimplePie
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item)
    {
        // SimplePie reports "no enclosures" as null rather than as an empty array
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
468
469
    /**
     * Tags the article according to the rel-tag links found in its content.
     *
     * Every <a rel="tag"> element contributes one tag: the lowercased link text
     * (with markup stripped) is the tag name, the link's href is the value passed
     * along with it. Links without text are ignored.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $links = (new Crawler($article->content))->filter('a[rel="tag"]');

        $candidates = $links->each(function(Crawler $link, $index) {
            return [
                'href' => $link->attr('href') ?? false,
                'value' => $link->text() ?? false,
            ];
        });

        $tags = [];
        foreach ($candidates as $candidate) {
            if ($candidate['value']) {
                // Only links that actually name a tag are of interest
                $tags[strtolower(strip_tags($candidate['value']))] = $candidate['href'];
            }
        }

        if (empty($tags)) {
            return;
        }
        net_nemein_tag_handler::tag_object($article, $tags);
    }
499
}
500