Passed
Push — master ( 33e64a...a4094b )
by Andreas
10:57
created

net_nemein_rss_fetch::import_article()   F

Complexity

Conditions 18
Paths 482

Size

Total Lines 91
Code Lines 53

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 45
CRAP Score 19.1133

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 18
eloc 53
c 1
b 0
f 0
nc 482
nop 1
dl 0
loc 91
ccs 45
cts 53
cp 0.8491
crap 19.1133
rs 1.4194

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
use midcom\dba\softdelete;
11
use SimplePie\SimplePie;
12
13
/**
14
 * RSS and Atom feed fetching class. Caches the fetched items as articles
15
 * in net.nehmer.blog
16
 *
17
 * @package net.nemein.rss
18
 */
19
class net_nemein_rss_fetch
20
{
21
    /**
     * Most recent error reported by the SimplePie parser, if any.
     * Assigned by fetch() when the parser signals an error.
     */
    public $lasterror;

    /**
     * Feed record this fetcher was constructed with
     */
    private net_nemein_rss_feed_dba $_feed;

    /**
     * Name of the midcom_db_article field used for storing the feed item GUIDs
     */
    private string $_guid_property = 'extra2';

    /**
     * Topic the items are imported into (loaded from the feed's node)
     */
    private midcom_db_topic $_node;
37
38
    /**
     * Binds the fetcher to the given feed and loads the topic the feed's
     * node property refers to
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_node = new midcom_db_topic($feed->node);
        $this->_feed = $feed;
    }
46
47 2
    /**
     * Creates a SimplePie instance that uses net_nemein_rss_parser_item as its
     * item class, the current i18n charset as output encoding and the
     * configured midcom_tempdir as cache location
     */
    public static function get_parser() : SimplePie
    {
        $charset = midcom::get()->i18n->get_current_charset();
        $cache_dir = midcom::get()->config->get('midcom_tempdir');

        $simplepie = new SimplePie;
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);
        $simplepie->set_output_encoding($charset);
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
55
56
    /**
     * Points a freshly configured parser at the given URL and initializes it
     *
     * The parser is returned regardless of the outcome; callers check
     * error() on it themselves.
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $simplepie = self::get_parser();
        $simplepie->set_feed_url($url);
        $simplepie->init();

        return $simplepie;
    }
66
67
    /**
     * Fetch given RSS or Atom feed
     *
     * An empty array is returned when the parser reports an error (the message
     * is then kept in $lasterror) or when the ETag header equals the one stored
     * on the feed during an earlier run. Otherwise the new ETag is stored, the
     * feed's latestfetch timestamp is set to the current time and the parsed
     * items are returned.
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    public function fetch() : array
    {
        $parser = self::raw_fetch($this->_feed->url);
        if ($error = $parser->error()) {
            $this->lasterror = $error;
            return [];
        }
        if (!empty($parser->data['headers']['etag'])) {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);

            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');
            // Strict comparison: with == two different numeric-looking ETags
            // (e.g. "0e1" and "0e2") would be treated as equal
            if (   !empty($feed_etag)
                && $feed_etag === $etag) {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time without creating a revision for it
        $this->_feed->latestfetch = time();
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
100
101
    /**
     * Fetches and imports items in the feed
     *
     * Nothing is done when the target topic has no component or the fetch
     * yields no items. After importing, articles no longer present in the feed
     * are cleaned up (see clean()).
     *
     * @return net_nemein_rss_parser_item[] The fetched items in feed order
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // This feed didn't return any items, skip
            return [];
        }

        // Import oldest first so that creation times remain in correct order
        // even for feeds without timestamps
        $oldest_first = array_reverse($fetched);

        foreach ($oldest_first as $entry) {
            $local_guid = $this->import_item($entry);
            if (!$local_guid) {
                debug_add("Failed to import item " . $entry->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }
            $entry->set_local_guid($local_guid);
            debug_add("Imported item " . $entry->get_id() . ' as ' . $local_guid, MIDCOM_LOG_INFO);
        }

        $this->clean($oldest_first);

        // Hand the items back in their original order
        return array_reverse($oldest_first);
    }
133
134
    /**
     * Imports a feed item into the database
     *
     * Only topics running net.nehmer.blog are supported.
     *
     * @throws midcom_error If the target topic uses any other component
     * @return ?string GUID of the imported article, or null if nothing was stored
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
144
145
    /**
     * Imports an item as a news article
     *
     * Looks up the article stored for this item earlier (or prepares a new
     * one), copies title, content, link, categories, author and publication
     * date onto it, saves it and then stores tags and enclosure parameters.
     *
     * @return ?string GUID of the article, or null if the item was skipped or saving failed
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        if (   (   empty($title)
                || trim($title) == '...')
            && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $article->set_rcs_message(sprintf(midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss'), $title, $this->_feed->title));

        $values = $this->build_article_values($item, $article, $guid, $title);

        $meta_values = [];
        $article_author = $this->find_author($item);
        if (!empty($article_author->guid)) {
            $meta_values['authors'] = "|{$article_author->guid}|";
        }

        // Try to figure out item publication date. Without one the current time
        // is used, but an already stored article keeps its published date.
        $feed_date = $item->get_date('U');
        $article_date = $feed_date ?: time();
        if ($feed_date || !$article->id) {
            $meta_values['published'] = $article_date;
        }

        $this->remember_latest_update($article_date);

        if (!$this->store_article($article, $values, $meta_values)) {
            return null;
        }

        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        if ($replies_url = $item->get_link(0, 'replies')) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $replies_url);
        }

        return $article->guid;
    }

    /**
     * Collects the plain article fields to write for a feed item
     *
     * @param mixed $guid Item id as returned by the parser item
     * @param mixed $title Item title as returned by the parser item
     * @return array Field name => value map
     */
    private function build_article_values(net_nemein_rss_parser_item $item, midcom_db_article $article, $guid, $title) : array
    {
        $values = [
            'title' => $title,
            $this->_guid_property => $guid, // FIXME: This breaks with URLs longer than 255 chars
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];

        // Safety, make sure we have sane name (allow_name_catenate was set by the caller, so this will not clash)
        if (empty($article->name)) {
            $values['name'] = midcom_helper_misc::urlize($title);
        }

        // Handle categories provided in the feed
        $categories = $item->get_categories();
        if (is_array($categories)) {
            foreach ($categories as $category) {
                // '|' separates entries in extra1, so it must not occur inside a term.
                // The cast keeps trim() from receiving null for a category without term.
                $term = str_replace('|', '_', trim((string) $category->get_term()));
                $values['extra1'] .= "{$term}|";
            }
        }

        return $values;
    }

    /**
     * Caches the "latest updated" time on the feed if the given date is newer
     *
     * @param mixed $article_date Unix timestamp of the article being imported
     */
    private function remember_latest_update($article_date) : void
    {
        if ($article_date > $this->_feed->latestupdate) {
            $this->_feed->latestupdate = $article_date;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }
    }

    /**
     * Applies the collected values and saves the article
     *
     * New articles are always created; existing ones are only updated when
     * apply_values() reports an actual change.
     *
     * @return bool False if create() or update() failed
     */
    private function store_article(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        if (!$article->id) {
            $this->apply_values($article, $values, $meta_values);
            return (bool) $article->create();
        }

        if ($this->apply_values($article, $values, $meta_values)) {
            return (bool) $article->update();
        }
        return true;
    }
240
241 1
    /**
     * Determines the person to record as author of an item
     *
     * Precedence: the feed's default author when forceauthor is set, then the
     * person matched from the item, then the feed's default author, and
     * finally the person with ID 1.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        $feed = $this->_feed;
        if ($feed->forceauthor && $feed->defaultauthor) {
            // Feed has a "default author" set and enforces it
            return new midcom_db_person($feed->defaultauthor);
        }

        // "Midgard Admin", used as last resort
        $fallback_person_id = 1;

        $matched = $this->match_item_author($item);
        if ($matched && $matched->id != $fallback_person_id) {
            return $matched;
        }

        // No usable match: prefer the feed's default author over the fallback
        return new midcom_db_person($feed->defaultauthor ?: $fallback_person_id);
    }
263
264 1
    /**
     * Finds the article to write a feed item into
     *
     * @return ?midcom_db_article The article imported earlier for this item id, a new
     *         unsaved article for the feed's topic, or null when another article in
     *         the topic already carries the item's link
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $topic_id = $this->_feed->node;

        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $topic_id);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $known = $by_guid->execute();
        if (!empty($known)) {
            // This item has been imported already earlier. Update
            return $known[0];
        }

        // Check against duplicate hits that may come from different feeds
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $topic_id);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Dupe, skip
                return null;
            }
        }

        // This is a new item
        $fresh = new midcom_db_article();
        $fresh->topic = $topic_id;
        return $fresh;
    }
291
292 1
    /**
     * Copies the given values onto the article and its metadata
     *
     * Only fields whose current value is not identical (!==) to the new one
     * are assigned.
     *
     * @param array $values Field name => value for the article itself
     * @param array $meta_values Field name => value for the article's metadata
     * @return bool True if at least one field was assigned
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $field => $new_value) {
            if ($article->$field === $new_value) {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value) {
            if ($article->metadata->$field === $new_value) {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
312
313
    /**
     * Cleans up old, removed items from feeds
     *
     * Deletes and purges every article tagged with this feed's marker in
     * extra1 whose stored item id is not among the given items. Does nothing
     * when the feed is configured to keep removed items.
     *
     * @param net_nemein_rss_parser_item[] $items Items currently present in the feed
     */
    private function clean(array $items)
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from array
            return;
        }

        // Create array of item GUIDs
        $item_guids = [];
        foreach ($items as $item) {
            $id = $item->get_id();
            $item_guids[] = $id;
            if (is_string($id) && strlen($id) > 255) {
                // find_article() looks items up by the first 255 bytes of the id,
                // so an article stored under the shortened id belongs to an item
                // that is still present and must not be removed
                $item_guids[] = substr($id, 0, 255);
            }
        }

        // Find articles resulting from this feed
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();
        $purge_guids = [];
        foreach ($local_items as $item) {
            $purge_guids[] = $item->guid;
            $item->delete();
        }

        softdelete::purge($purge_guids, 'midgard_article');
    }
345
346
    /**
     * Parses author formats used by different feed standards and
     * returns the information
     *
     * The author's name is used if present, otherwise the email field. The
     * value is entity-decoded and, if it contains "<" or "(", split into email
     * and name according to the "Full Name <email>" or "email (Full Name)"
     * conventions.
     *
     * @return array May contain the keys 'email', 'full_name' (name containing
     *         a space) and 'username' (name without a space)
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        // First try dig up any information about the author possible
        if ($author = $item->get_author()) {
            $charset = midcom::get()->i18n->get_current_charset();
            $name = $author->get_name();
            $email = $author->get_email();
            if (!empty($name)) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username
                $author_info['user_or_full'] = $name;
            } else {
                // The email may be missing as well; cast so that
                // html_entity_decode() never receives null
                $name = html_entity_decode((string) $email, ENT_QUOTES, $charset);
            }

            if (!preg_match('/[<\(]/', $name)) {
                $author_info['user_or_full'] = $name;
            } else {
                if (str_contains($name, '<')) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                // If the pattern does not match, a value set from the name above is kept
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $author_info['user_or_full'] = trim($author_info['user_or_full']);
            if (str_contains($author_info['user_or_full'], ' ')) {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            } else {
                $author_info['username'] = $author_info['user_or_full'];
            }
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
396
397
    /**
     * Parses author formats used by different feed standards and
     * tries to match to persons in database.
     *
     * Lookup order: email address, then username, then first and last name
     * (the first two space-separated parts of the full name).
     *
     * @return ?midcom_db_person The first matching person, or null if none was found
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        $info = self::parse_item_author($item);

        if (!empty($info['email'])) {
            // Email is a pretty good identifier, start with it
            $by_email = midcom_db_person::new_query_builder();
            $by_email->add_constraint('email', '=', $info['email']);
            $hits = $by_email->execute();
            if (!empty($hits)) {
                return $hits[0];
            }
        }

        if (!empty($info['username'])) {
            $user = midcom::get()->auth->get_user_by_name($info['username']);
            if ($user) {
                return $user->get_storage();
            }
        }

        if (empty($info['full_name'])) {
            return null;
        }

        $parts = explode(' ', $info['full_name']);
        if (count($parts) <= 1) {
            return null;
        }

        // We assume the western format Firstname Lastname
        [$firstname, $lastname] = $parts;

        $by_name = midcom_db_person::new_query_builder();
        $by_name->add_constraint('firstname', '=', $firstname);
        $by_name->add_constraint('lastname', '=', $lastname);
        $hits = $by_name->execute();

        return empty($hits) ? null : $hits[0];
    }
440
441
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * Stores link, duration and MIME type of the item's enclosures in the
     * net.nemein.rss:enclosure parameter domain. Every enclosure writes the
     * same three parameter names, so with several enclosures the last one wins.
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item)
    {
        // SimplePie documents get_enclosures() as nullable; fall back to an empty
        // list so that an item without enclosures is simply a no-op
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
452
453
    /**
     * Parses rel-tag links in article content and tags the object based on them
     *
     * Each <a rel="tag"> contributes its lowercased, tag-stripped text as tag
     * name and its href as tag value. Links without text are ignored.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $links = (new Crawler($article->content))->filter('a[rel="tag"]');

        $found = $links->each(fn (Crawler $link) => [
            'href' => $link->attr('href') ?? false,
            'value' => $link->text() ?? false,
        ]);

        $tags = [];
        foreach ($found as $entry) {
            if ($entry['value']) {
                $tags[strtolower(strip_tags($entry['value']))] = $entry['href'];
            }
        }

        if (empty($tags)) {
            return;
        }
        net_nemein_tag_handler::tag_object($article, $tags, $this->_node->component);
    }
483
}
484