Passed
Push — master ( 33e64a...a4094b )
by Andreas
10:57
created

net_nemein_rss_fetch::raw_fetch()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 4
nc 1
nop 1
dl 0
loc 6
ccs 0
cts 5
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
use midcom\dba\softdelete;
11
use SimplePie\SimplePie;
12
13
/**
14
 * RSS and Atom feed fetching class. Caches the fetched items as articles
15
 * in net.nehmer.blog
16
 *
17
 * @package net.nemein.rss
18
 */
19
class net_nemein_rss_fetch
20
{
21
    /**
22
     * The last error reported by SimplePie, if any (assigned by fetch() when the parser reports an error)
23
     */
24
    public $lasterror;
25
26
    /**
     * The feed record passed to the constructor; fetch(), import_article() and clean() read its settings and update it
     */
    private net_nemein_rss_feed_dba $_feed;
27
28
    /**
29
     * Property of midcom_db_article we're using for storing the feed item GUIDs
30
     */
31
    private string $_guid_property = 'extra2';
32
33
    /**
34
     * Current node we're importing to (loaded in the constructor from the feed's node)
35
     */
36
    private midcom_db_topic $_node;
37
38
    /**
     * Binds the fetcher to a feed and loads the topic that feed imports into
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_node = new midcom_db_topic($feed->node);
        $this->_feed = $feed;
    }
46
47 2
    /**
     * Builds a SimplePie instance configured for this component
     *
     * Registers net_nemein_rss_parser_item as the item class, sets the output
     * encoding to the current i18n charset and uses the configured
     * 'midcom_tempdir' as cache location.
     */
    public static function get_parser() : SimplePie
    {
        $simplepie = new SimplePie;
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);

        $charset = midcom::get()->i18n->get_current_charset();
        $simplepie->set_output_encoding($charset);

        $cache_dir = midcom::get()->config->get('midcom_tempdir');
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
55
56
    /**
     * Creates a configured parser, points it at the given URL and initializes it
     *
     * The result of init() is not checked here; callers inspect the returned
     * parser themselves (fetch() looks at its error()).
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $feed_parser = self::get_parser();
        $feed_parser->set_feed_url($url);
        $feed_parser->init();

        return $feed_parser;
    }
66
67
    /**
     * Fetch given RSS or Atom feed
     *
     * If the parser reports an error, it is stored in $lasterror and an empty
     * array is returned. If the response carries an ETag identical to the one
     * stored on the feed, the feed is treated as unchanged and an empty array is
     * returned without touching the feed record. Otherwise the new ETag (if any)
     * is stored, the feed's latestfetch is set to the current time and the
     * parsed items are returned.
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    public function fetch() : array
    {
        $parser = self::raw_fetch($this->_feed->url);
        if ($error = $parser->error()) {
            $this->lasterror = $error;
            return [];
        }

        if (!empty($parser->data['headers']['etag'])) {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);

            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');
            // ETags are opaque strings, so compare strictly: a loose == treats
            // numeric-looking values such as "100" and "1e2" as equal, which
            // would wrongly skip a changed feed
            if (   !empty($feed_etag)
                && (string) $feed_etag === $etag) {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time with _use_rcs switched off for this update
        $this->_feed->latestfetch = time();
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
100
101
    /**
     * Fetches the feed and imports every item it returned
     *
     * Returns an empty array when the target node has no component or when
     * fetch() yields nothing. Otherwise each item is passed to import_item(),
     * successfully imported items get their local GUID set, clean() is run,
     * and the items are returned.
     *
     * @return net_nemein_rss_parser_item[]
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // Nothing came back for this feed, so there is nothing to import
            return [];
        }

        // Work through the items in reverse so that creation times remain in
        // correct order even for feeds without timestamps
        $queue = array_reverse($fetched);

        foreach ($queue as $entry) {
            $local_guid = $this->import_item($entry);
            if (!$local_guid) {
                debug_add("Failed to import item " . $entry->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }

            $entry->set_local_guid($local_guid);
            debug_add("Imported item " . $entry->get_id() . ' as ' . $local_guid, MIDCOM_LOG_INFO);
        }

        $this->clean($queue);

        // Hand the items back in their original feed order
        return array_reverse($queue);
    }
133
134
    /**
     * Imports a single feed item into the database
     *
     * Only nodes running net.nehmer.blog are supported; the item is then
     * imported as an article.
     *
     * @throws midcom_error if the target node runs any other component
     * @return string|null whatever import_article() returns for the item
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
144
145
    /**
     * Imports an item as a news article
     *
     * Creates a new article, or updates the one find_article() returns for the
     * item. Returns the article's GUID, or null when the item is skipped (no
     * usable title and no ID, or find_article() returns nothing) or when
     * saving the article fails.
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        $title_unusable = empty($title) || trim($title) == '...';
        if ($title_unusable && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $rcs_template = midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss');
        $article->set_rcs_message(sprintf($rcs_template, $title, $this->_feed->title));

        $values = [
            'title' => $title,
            $this->_guid_property => $guid, // FIXME: This breaks with URLs longer than 255 chars
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            // extra1 starts with the feed marker; category terms are appended below
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];

        // Safety, make sure we have sane name (allow_name_catenate was set earlier, so this will not clash)
        if (empty($article->name)) {
            $values['name'] = midcom_helper_misc::urlize($title);
        }

        // Handle categories provided in the feed
        $categories = $item->get_categories();
        if (is_array($categories)) {
            foreach ($categories as $category) {
                // '|' is the separator inside extra1, so it must not occur in a term
                $term = trim($category->get_term());
                $values['extra1'] .= str_replace('|', '_', $term) . '|';
            }
        }

        $meta_values = [];

        $author = $this->find_author($item);
        if (!empty($author->guid)) {
            $meta_values['authors'] = "|{$author->guid}|";
        }

        // Try to figure out item publication date
        $published = $item->get_date('U');
        if ($published) {
            $meta_values['published'] = $published;
        } else {
            // No date in the feed: use "now", but only stamp it on new articles
            $published = time();
            if (!$article->id) {
                $meta_values['published'] = $published;
            }
        }

        if ($published > $this->_feed->latestupdate) {
            // Cache "latest updated" time to feed
            $this->_feed->latestupdate = $published;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }

        $is_existing = (bool) $article->id;
        $changed = $this->apply_values($article, $values, $meta_values);
        if ($is_existing) {
            // Existing articles are only written when something actually changed
            $saved = !$changed || $article->update();
        } else {
            $saved = $article->create();
        }
        if (!$saved) {
            return null;
        }

        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        $replies_url = $item->get_link(0, 'replies');
        if ($replies_url) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $replies_url);
        }

        return $article->guid;
    }
240
241 1
    /**
     * Determines the person to record as author of an imported item
     *
     * Order of preference: the feed's default author when the feed forces it,
     * then a person matched from the item's author data, then the feed's
     * default author, and finally the person with ID 1.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        if (   $this->_feed->forceauthor
            && $this->_feed->defaultauthor) {
            // Feed has a "default author" set and enforces it
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        // "Midgard Admin", used as last resort
        $admin_id = 1;

        $matched = $this->match_item_author($item);
        if (   $matched
            && $matched->id != $admin_id) {
            return $matched;
        }

        // No usable match: prefer the feed's default author over the admin fallback
        $person_id = $this->_feed->defaultauthor ?: $admin_id;
        return new midcom_db_person($person_id);
    }
263
264 1
    /**
     * Locates the article an item should be written to
     *
     * Returns the article already stored in the feed's topic for this item ID
     * (matched on the first 255 bytes), a new unsaved article assigned to that
     * topic, or null if another article in the topic already has the item's link.
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $topic_id = $this->_feed->node;

        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $topic_id);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $by_guid->execute();
        if (!empty($existing)) {
            // This item has been imported already earlier. Update
            return $existing[0];
        }

        // Check against duplicate hits that may come from different feeds
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $topic_id);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Dupe, skip
                return null;
            }
        }

        // This is a new item
        $fresh = new midcom_db_article();
        $fresh->topic = $topic_id;
        return $fresh;
    }
291
292 1
    /**
     * Copies field and metadata values onto the article
     *
     * A value is only assigned when it is not identical (!==) to what the
     * article currently holds.
     *
     * @param array $values field name => value, set on the article itself
     * @param array $meta_values field name => value, set on the article's metadata
     * @return bool true if at least one value was assigned
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $field => $new_value) {
            if ($article->$field === $new_value) {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value) {
            if ($article->metadata->$field === $new_value) {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
312
313
    /**
     * Cleans up old, removed items from feeds
     *
     * Unless the feed is configured to keep removed items, every article
     * carrying this feed's marker in extra1 whose stored item ID is not among
     * the given items is deleted, and the deleted GUIDs are then purged.
     *
     * @param net_nemein_rss_parser_item[] $items Items currently present in the feed
     */
    private function clean(array $items) : void
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from array
            return;
        }

        // Create array of item GUIDs
        $item_guids = [];
        foreach ($items as $item) {
            $id = $item->get_id();
            $item_guids[] = $id;
            // find_article() matches stored IDs against the first 255 bytes of
            // the item ID. List that shortened form as well, otherwise an article
            // stored under a shortened ID would be treated as removed from the
            // feed and deleted right after it was imported.
            if (   is_string($id)
                && strlen($id) > 255) {
                $item_guids[] = substr($id, 0, 255);
            }
        }

        // Find articles resulting from this feed
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();

        $purge_guids = [];
        foreach ($local_items as $item) {
            $purge_guids[] = $item->guid;
            $item->delete();
        }

        softdelete::purge($purge_guids, 'midgard_article');
    }
345
346
    /**
     * Parses author formats used by different feed standards
     * and returns the information
     *
     * The author's name is used if present, otherwise the email field. The
     * value is entity-decoded and, if it contains '<' or '(', matched against
     * the "Full Name <email>" or "email (Full Name)" format.
     *
     * @return array May contain 'email', plus either 'full_name' (value contains a space)
     *               or 'username' (value without a space); empty if the item has no author
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        // First try dig up any information about the author possible
        if ($author = $item->get_author()) {
            $charset = midcom::get()->i18n->get_current_charset();
            $name = $author->get_name();
            $email = $author->get_email();
            if (!empty($name)) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username.
                // This also serves as fallback if the format matching below fails
                $author_info['user_or_full'] = $name;
            } else {
                // The email may be missing as well; cast so that null is never
                // passed to html_entity_decode() (deprecated since PHP 8.1)
                $name = html_entity_decode((string) $email, ENT_QUOTES, $charset);
            }

            if (!preg_match('/[<\(]/', $name)) {
                $author_info['user_or_full'] = $name;
            } else {
                if (str_contains($name, '<')) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $author_info['user_or_full'] = trim($author_info['user_or_full']);
            if (str_contains($author_info['user_or_full'], ' ')) {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            } else {
                $author_info['username'] = $author_info['user_or_full'];
            }
            // The intermediate key is never exposed to callers
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
396
397
    /**
     * Parses author formats used by different feed standards and
     * tries to match to persons in database.
     *
     * Lookup order: email address, then username, then first and last name.
     *
     * @return midcom_db_person|null The first matching person, or null if nothing matched
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        // Parse the item for author information
        $author_info = self::parse_item_author($item);

        if (!empty($author_info['email'])) {
            // Email is a pretty good identifier, start with it
            $person_qb = midcom_db_person::new_query_builder();
            $person_qb->add_constraint('email', '=', $author_info['email']);
            $persons = $person_qb->execute();
            if (!empty($persons)) {
                return $persons[0];
            }
        }

        if (   !empty($author_info['username'])
            && $person = midcom::get()->auth->get_user_by_name($author_info['username'])) {
            return $person->get_storage();
        }

        if (!empty($author_info['full_name'])) {
            // Split on runs of whitespace: with a plain explode(' ', ...) a name
            // containing repeated spaces ("John  Doe") yields an empty last name
            $name_parts = preg_split('/\s+/', $author_info['full_name'], -1, PREG_SPLIT_NO_EMPTY) ?: [];
            if (count($name_parts) > 1) {
                // We assume the western format Firstname Lastname;
                // any further name parts are ignored
                [$firstname, $lastname] = $name_parts;

                $person_qb = midcom_db_person::new_query_builder();
                $person_qb->add_constraint('firstname', '=', $firstname);
                $person_qb->add_constraint('lastname', '=', $lastname);
                $persons = $person_qb->execute();
                if (!empty($persons)) {
                    return $persons[0];
                }
            }
        }

        return null;
    }
440
441
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * For each enclosure of the item, its link, duration and type are stored
     * as parameters in the 'net.nemein.rss:enclosure' domain.
     * NOTE(review): every enclosure writes the same three parameter names, so
     * with several enclosures presumably only the last one remains - confirm
     * whether that is intended.
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item)
    {
        // SimplePie documents get_enclosures() as returning null for items
        // without enclosures; fall back to an empty list so foreach never
        // receives null
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
452
453
    /**
     * Parses rel-tag links in article content and tags the object based on them
     *
     * Every <a rel="tag"> in the content contributes one tag: the lowercased
     * link text is the tag name, the link's href the value. Links without text
     * are ignored; nothing is tagged if no tags are found.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $tag_links = (new Crawler($article->content))->filter('a[rel="tag"]');

        $found = $tag_links->each(fn (Crawler $link) => [
            'href' => $link->attr('href') ?? false,
            'value' => $link->text() ?? false,
        ]);

        $tags = [];
        foreach ($found as $candidate) {
            // Only links that actually specify a tag are used
            if ($candidate['value']) {
                $tags[strtolower(strip_tags($candidate['value']))] = $candidate['href'];
            }
        }

        if (empty($tags)) {
            return;
        }
        net_nemein_tag_handler::tag_object($article, $tags, $this->_node->component);
    }
483
}
484