Passed
Push — master ( 40aec1...d8787f )
by Andreas
11:30
created

net_nemein_rss_fetch::fetch()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 27
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 30

Importance

Changes 0
Metric Value
cc 5
eloc 16
nc 4
nop 0
dl 0
loc 27
ccs 0
cts 17
cp 0
crap 30
rs 9.4222
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
use midcom\dba\softdelete;
11
use SimplePie\SimplePie;
12
13
/**
14
 * RSS and Atom feed fetching class. Caches the fetched items as articles
15
 * in net.nehmer.blog
16
 *
17
 * @package net.nemein.rss
18
 */
19
class net_nemein_rss_fetch
20
{
21
    /**
22
     * The last error reported by SimplePie, if any
23
     */
24
    public $lasterror;
25
26
    private net_nemein_rss_feed_dba $_feed;
27
28
    /**
29
     * Property of midcom_db_article we're using for storing the feed item GUIDs
30
     */
31
    private string $_guid_property = 'extra2';
32
33
    /**
34
     * Current node we're importing to
35
     */
36
    private midcom_db_topic $_node;
37
38
    /**
     * Binds the fetcher to a feed and loads the topic that feed points to
     *
     * @param net_nemein_rss_feed_dba $feed Feed record; its node property is used to load the target topic
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_node = new midcom_db_topic($feed->node);
        $this->_feed = $feed;
    }
46
47 2
    /**
     * Builds a SimplePie instance configured for this component
     *
     * Items are registered to be created as net_nemein_rss_parser_item, output
     * uses the current i18n charset and the cache location is the configured
     * midcom_tempdir.
     */
    public static function get_parser() : SimplePie
    {
        $charset = midcom::get()->i18n->get_current_charset();
        $cache_dir = midcom::get()->config->get('midcom_tempdir');

        $simplepie = new SimplePie();
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);
        $simplepie->set_output_encoding($charset);
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
55
56
    /**
     * Points a freshly configured parser at the given URL and initializes it
     *
     * The outcome of init() is not checked here; callers are expected to
     * inspect the returned parser (e.g. via error()).
     *
     * @param string $url Address of the feed
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $simplepie = self::get_parser();
        $simplepie->set_feed_url($url);
        $simplepie->init();

        return $simplepie;
    }
66
67
    /**
     * Fetch given RSS or Atom feed
     *
     * On a parser error the message is kept in $lasterror and nothing is
     * returned. When the response carries an ETag equal to the one stored on
     * the feed, the feed is treated as unchanged and nothing is returned
     * either. Otherwise the new ETag and the fetch time are saved on the feed.
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    private function fetch() : array
    {
        $parser = self::raw_fetch($this->_feed->url);
        if ($error = $parser->error()) {
            $this->lasterror = $error;
            return [];
        }

        if (!empty($parser->data['headers']['etag'])) {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);
            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');

            // Strict comparison: ETags are opaque strings, and loose comparison would
            // treat numeric-looking values such as "1e3" and "1000" as identical
            if (   !empty($feed_etag)
                && $feed_etag === $etag) {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time without creating an RCS revision of the feed object
        $this->_feed->latestfetch = time();
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
100
101
    /**
     * Fetches the feed and imports every item it contains
     *
     * Nothing happens when the target topic has no component or when the fetch
     * yields no items. After importing, stale local articles are cleaned up.
     *
     * @return net_nemein_rss_parser_item[] The fetched items in feed order
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // This feed didn't return any items, skip
            return [];
        }

        // Import oldest first so that creation times remain in correct order even for feeds without timestamps
        $queue = array_reverse($fetched);

        foreach ($queue as $entry) {
            $local_guid = $this->import_item($entry);
            if (!$local_guid) {
                debug_add("Failed to import item " . $entry->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }
            $entry->set_local_guid($local_guid);
            debug_add("Imported item " . $entry->get_id() . ' as ' . $local_guid, MIDCOM_LOG_INFO);
        }

        $this->clean($queue);

        return array_reverse($queue);
    }
133
134
    /**
     * Stores a single feed item in the database
     *
     * Only topics running net.nehmer.blog are supported.
     *
     * @return string|null Whatever import_article() returns for the item
     * @throws midcom_error If the target topic runs any other component
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
144
145
    /**
     * Imports an item as a news article
     *
     * Creates a new article, or refreshes the one found for the same feed item,
     * then handles approval, tags, enclosure parameters and the replies link.
     *
     * @param net_nemein_rss_parser_item $item Feed item to store
     * @return string|null GUID of the stored article, or null when the item is
     *                     skipped (unusable entry, no article from find_article())
     *                     or saving fails
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        if (   (   empty($title)
                || trim($title) === '...')
            && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $article->set_rcs_message(sprintf(midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss'), $title, $this->_feed->title));

        $values = [
            'title' => $title,
            // find_article() looks articles up by the first 255 bytes of the item ID,
            // so store exactly that key; otherwise long IDs (e.g. URLs) never match
            // what was saved and the article is rewritten on every import
            $this->_guid_property => substr($guid, 0, 255),
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];
        $meta_values = [];

        // Safety, make sure we have sane name (allow_name_catenate was set earlier, so this will not clash)
        if (empty($article->name)) {
            $values['name'] = midcom_helper_misc::urlize($title);
        }

        $categories = $item->get_categories();
        if (is_array($categories)) {
            // Handle categories provided in the feed
            foreach ($categories as $category) {
                // Clean up the categories and save. The term is cast because it may
                // be null, and passing null to trim() is deprecated since PHP 8.1
                $category = str_replace('|', '_', trim((string) $category->get_term()));
                $values['extra1'] .= "{$category}|";
            }
        }

        $article_author = $this->find_author($item);
        if (!empty($article_author->guid)) {
            $meta_values['authors'] = "|{$article_author->guid}|";
        }

        // Try to figure out item publication date
        if ($article_date = $item->get_date('U')) {
            $meta_values['published'] = $article_date;
        } else {
            // No date in the feed: use "now", but only stamp it on new articles
            $article_date = time();
            if (!$article->id) {
                $meta_values['published'] = $article_date;
            }
        }

        if ($article_date > $this->_feed->latestupdate) {
            // Cache "latest updated" time to feed
            $this->_feed->latestupdate = $article_date;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }

        if ($article->id) {
            // Existing article: only hit the database when something actually changed
            if (   $this->apply_values($article, $values, $meta_values)
                && !$article->update()) {
                return null;
            }
        } else {
            $this->apply_values($article, $values, $meta_values);
            if (!$article->create()) {
                return null;
            }
        }

        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        if ($link = $item->get_link(rel: 'replies')) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $link);
        }

        return $article->guid;
    }
240
241 1
    /**
     * Determines which person to record as the author of an item
     *
     * A forced default author wins outright. Otherwise the item's own author is
     * matched against the database; if nothing matches (or the match is person
     * #1), the feed's default author is used, and person #1 as a last resort.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        $admin_id = 1;
        $configured = $this->_feed->defaultauthor;

        if ($this->_feed->forceauthor && $configured) {
            // Feed has a "default author" set, use it
            return new midcom_db_person($configured);
        }

        $matched = $this->match_item_author($item);
        if ($matched && $matched->id != $admin_id) {
            return $matched;
        }

        // Prefer the feed's default author, fall back to "Midgard Admin" just in case
        return new midcom_db_person($configured ?: $admin_id);
    }
263
264 1
    /**
     * Locates the article belonging to a feed item, or prepares a new one
     *
     * @param net_nemein_rss_parser_item $item Feed item being imported
     * @param string $guid Item ID; only its first 255 bytes are used for the lookup
     * @return midcom_db_article|null The previously imported article, an unsaved
     *                                article bound to the feed's topic, or null
     *                                when another article in the topic already
     *                                has the item's link
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $topic_id = $this->_feed->node;

        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $topic_id);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $by_guid->execute();
        if ($existing) {
            // This item has been imported already earlier. Update
            return $existing[0];
        }

        // Check against duplicate hits that may come from different feeds
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $topic_id);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Dupe, skip
                return null;
            }
        }

        // This is a new item
        $fresh = new midcom_db_article();
        $fresh->topic = $topic_id;

        return $fresh;
    }
290
291 1
    /**
     * Copies field and metadata values onto an article, touching only what differs
     *
     * @param array $values Article properties keyed by field name
     * @param array $meta_values Metadata properties keyed by field name
     * @return bool True when at least one property was changed
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $field => $new_value) {
            if ($article->$field === $new_value) {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value) {
            if ($article->metadata->$field === $new_value) {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
311
312
    /**
     * Cleans up old, removed items from feeds
     *
     * Deletes and purges every article tagged with this feed's marker whose
     * stored item ID is not among the given items. Does nothing when the feed
     * is configured to keep removed items.
     *
     * @param net_nemein_rss_parser_item[] $items Items currently present in the feed
     */
    private function clean(array $items) : void
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from the feed
            return;
        }

        // Create array of item GUIDs, cut to the same 255-byte key find_article()
        // uses for lookups. Comparing against the full-length ID would make every
        // article with a long ID look "removed" and get purged on each run.
        $item_guids = [];
        foreach ($items as $item) {
            $item_guids[] = substr($item->get_id(), 0, 255);
        }

        // Find articles resulting from this feed
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();

        $purge_guids = [];
        foreach ($local_items as $item) {
            $purge_guids[] = $item->guid;
            $item->delete();
        }

        softdelete::purge($purge_guids, 'midgard_article');
    }
344
345
    /**
     * Parses author formats used by different feed standards and
     * returns the information
     *
     * @return array Any of the keys 'email', 'username' and 'full_name' that
     *               could be determined; empty when the item has no author
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        // First try dig up any information about the author possible
        if ($author = $item->get_author()) {
            $charset = midcom::get()->i18n->get_current_charset();

            if ($name = $author->get_name()) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username.
                // Also serves as the fallback when the patterns below don't match
                $author_info['user_or_full'] = $name;
            } else {
                // No name, work from the email field instead. It may be null (e.g. an
                // author carrying only a URI), and passing null to
                // html_entity_decode() is deprecated since PHP 8.1, hence the cast
                $name = html_entity_decode((string) $author->get_email(), ENT_QUOTES, $charset);
            }

            if (!preg_match('/[<\(]/', $name)) {
                $author_info['user_or_full'] = $name;
            } else {
                if (str_contains($name, '<')) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $author_info['user_or_full'] = trim($author_info['user_or_full']);
            if (str_contains($author_info['user_or_full'], ' ')) {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            } else {
                $author_info['username'] = $author_info['user_or_full'];
            }
            // The intermediate key is internal and never returned
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
394
395
    /**
     * Resolves the author of a feed item to a person in the database
     *
     * Tries, in order: exact email match, user lookup by username, and finally
     * a firstname/lastname match built from the first two words of the full name.
     *
     * @return midcom_db_person|null The first matching person, or null if none is found
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        $info = self::parse_item_author($item);

        // Email is a pretty good identifier, start with it
        if (!empty($info['email'])) {
            $by_email = midcom_db_person::new_query_builder();
            $by_email->add_constraint('email', '=', $info['email']);
            $hits = $by_email->execute();
            if ($hits) {
                return $hits[0];
            }
        }

        if (!empty($info['username'])) {
            $user = midcom::get()->auth->get_user_by_name($info['username']);
            if ($user) {
                return $user->get_storage();
            }
        }

        if (empty($info['full_name'])) {
            return null;
        }

        $name_parts = explode(' ', $info['full_name']);
        if (count($name_parts) <= 1) {
            return null;
        }

        // We assume the western format Firstname Lastname
        [$firstname, $lastname] = $name_parts;

        $by_name = midcom_db_person::new_query_builder();
        $by_name->add_constraint('firstname', '=', $firstname);
        $by_name->add_constraint('lastname', '=', $lastname);
        $hits = $by_name->execute();

        return $hits ? $hits[0] : null;
    }
435
436
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * Every enclosure is written to the same three parameters (url, duration,
     * mimetype in the net.nemein.rss:enclosure domain), so with several
     * enclosures the values of the last one remain.
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item) : void
    {
        // get_enclosures() is documented as nullable in SimplePie; iterating null would raise a warning
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
447
448
    /**
     * Tags the article based on the rel-tag links found in its content
     *
     * Each <a rel="tag"> contributes one tag: the lowercased link text (markup
     * stripped) as the tag, its href as the value. Links without text are ignored.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $links = (new Crawler($article->content))->filter('a[rel="tag"]');

        $found = $links->each(fn (Crawler $link) => [
            'href' => $link->attr('href') ?? false,
            'value' => $link->text() ?? false,
        ]);

        $tags = [];
        foreach ($found as $entry) {
            // Only links with an actual tag text are used
            if ($entry['value']) {
                $tags[strtolower(strip_tags($entry['value']))] = $entry['href'];
            }
        }

        if ($tags !== []) {
            net_nemein_tag_handler::tag_object($article, $tags, $this->_node->component);
        }
    }
478
}
479