net_nemein_rss_fetch::import_article()   F
last analyzed

Complexity

Conditions 18
Paths 482

Size

Total Lines 91
Code Lines 53

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 45
CRAP Score 19.1133

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 18
eloc 53
c 1
b 0
f 0
nc 482
nop 1
dl 0
loc 91
ccs 45
cts 53
cp 0.8491
crap 19.1133
rs 1.4194

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
use midcom\dba\softdelete;
11
use SimplePie\SimplePie;
12
13
/**
14
 * RSS and Atom feed fetching class. Caches the fetched items as articles
15
 * in net.nehmer.blog
16
 *
17
 * @package net.nemein.rss
18
 */
19
class net_nemein_rss_fetch
20
{
21
    /**
22
     * The last error reported by SimplePie, if any
23
     */
24
    public $lasterror;
25
26
    private net_nemein_rss_feed_dba $_feed;
27
28
    /**
29
     * Property of midcom_db_article we're using for storing the feed item GUIDs
30
     */
31
    private string $_guid_property = 'extra2';
32
33
    /**
34
     * Current node we're importing to
35
     */
36
    private midcom_db_topic $_node;
37
38
    /**
     * Binds the fetcher to a feed and loads the topic that feed points to.
     *
     * @param net_nemein_rss_feed_dba $feed Feed whose items are going to be imported
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_feed = $feed;

        // The feed's "node" property identifies the topic receiving the imported items
        $target_node = new midcom_db_topic($feed->node);
        $this->_node = $target_node;
    }
46
47 2
    /**
     * Builds a SimplePie instance configured for this component: feed items are
     * produced as net_nemein_rss_parser_item, output uses the current i18n
     * charset and the cache location is the configured MidCOM temp directory.
     */
    public static function get_parser() : SimplePie
    {
        $simplepie = new SimplePie;
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);

        $charset = midcom::get()->i18n->get_current_charset();
        $simplepie->set_output_encoding($charset);

        $cache_dir = midcom::get()->config->get('midcom_tempdir');
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
55
56
    /**
     * Loads the feed at the given URL through a freshly configured parser.
     *
     * The parser is returned regardless of the outcome; callers inspect
     * its error() themselves.
     *
     * @param string $url Address of the RSS or Atom feed
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $feed_parser = self::get_parser();

        $feed_parser->set_feed_url($url);
        $feed_parser->init();

        return $feed_parser;
    }
66
67
    /**
     * Retrieves the items of the configured feed.
     *
     * A parser error is stored in $lasterror and yields an empty list. When the
     * response carries an ETag equal to the one remembered for the feed, the feed
     * counts as unchanged and an empty list is returned as well. Otherwise the
     * ETag is remembered and the feed's latestfetch timestamp is refreshed.
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    private function fetch() : array
    {
        $feed = $this->_feed;
        $parser = self::raw_fetch($feed->url);

        if ($parser->error()) {
            $this->lasterror = $parser->error();
            return [];
        }

        $etag_header = $parser->data['headers']['etag'] ?? null;
        if (!empty($etag_header)) {
            // Compare the received ETag with the one stored on the previous run
            $etag = trim($etag_header);
            $known_etag = $feed->get_parameter('net.nemein.rss', 'etag');

            $unchanged = !empty($known_etag) && $known_etag == $etag;
            if ($unchanged) {
                // Nothing new on the remote side, no need to import anything
                debug_add("Feed {$feed->url} has not changed since " . date('c', $feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time without writing a revision for it
        $feed->latestfetch = time();
        $feed->_use_rcs = false;
        $feed->update();

        return $parser->get_items();
    }
100
101
    /**
     * Fetches the feed and imports every item it contains.
     *
     * Nothing happens when the target node has no component or the fetch
     * yields no items. After importing, articles no longer present in the
     * feed are handed to clean().
     *
     * @return net_nemein_rss_parser_item[] The fetched items in feed order
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // Nothing came back from the feed, so there is nothing to import
            return [];
        }

        // Process oldest-first so creation times stay in order even when the feed has no timestamps
        $oldest_first = array_reverse($fetched);

        foreach ($oldest_first as $feed_item) {
            $local_guid = $this->import_item($feed_item);
            if (!$local_guid) {
                debug_add("Failed to import item " . $feed_item->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }

            $feed_item->set_local_guid($local_guid);
            debug_add("Imported item " . $feed_item->get_id() . ' as ' . $local_guid, MIDCOM_LOG_INFO);
        }

        $this->clean($oldest_first);

        // Hand the items back in their original feed order
        return array_reverse($oldest_first);
    }
133
134
    /**
     * Stores a single feed item in the database.
     *
     * Only nodes running net.nehmer.blog are supported.
     *
     * @return string|null GUID of the stored article, null if the item was not imported
     * @throws midcom_error If the target node runs any other component
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
144
145
    /**
     * Imports an item as a news article
     *
     * Looks up (or prepares) the article belonging to the item, fills in its
     * fields and metadata, saves it and finally runs the post-import steps
     * (approval, tagging, enclosure and replies parameters).
     *
     * @return string|null GUID of the stored article, or null when the item was
     *                     skipped (unusable entry, duplicate) or could not be saved
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        if (   (   empty($title)
                || trim($title) == '...')
            && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $article->set_rcs_message(sprintf(midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss'), $title, $this->_feed->title));

        $values = $this->build_article_values($item, $article, $guid, $title);

        $meta_values = [];
        $article_author = $this->find_author($item);
        if (!empty($article_author->guid)) {
            $meta_values['authors'] = "|{$article_author->guid}|";
        }

        // Try to figure out item publication date. Without a date in the feed we
        // fall back to "now", but only stamp it on articles that are new, so that
        // re-importing an existing article does not shift its publication time.
        $feed_date = $item->get_date('U');
        $article_date = $feed_date ?: time();
        if ($feed_date || !$article->id) {
            $meta_values['published'] = $article_date;
        }

        $this->remember_latest_update($article_date);

        if (!$this->save_article($article, $values, $meta_values)) {
            return null;
        }

        $this->finalize_article($article, $item);

        return $article->guid;
    }

    /**
     * Collects the article field values for a feed item.
     *
     * @param string|null $guid Feed item ID as returned by the item
     * @param string|null $title Feed item title as returned by the item
     * @return array Field name => value pairs, in the order they get applied
     */
    private function build_article_values(net_nemein_rss_parser_item $item, midcom_db_article $article, $guid, $title) : array
    {
        $values = [
            'title' => $title,
            $this->_guid_property => $guid, // FIXME: This breaks with URLs longer than 255 chars
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            // extra1 starts with the feed marker that clean() later searches for
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];

        // Safety, make sure we have a sane name (allow_name_catenate was set earlier, so this will not clash)
        if (empty($article->name)) {
            $values['name'] = midcom_helper_misc::urlize($title);
        }

        $categories = $item->get_categories();
        if (is_array($categories)) {
            // Append the categories provided in the feed, pipe-separated
            foreach ($categories as $category) {
                // "|" is the separator, so it must not appear inside a category
                $category = str_replace('|', '_', trim($category->get_term()));
                $values['extra1'] .= "{$category}|";
            }
        }

        return $values;
    }

    /**
     * Caches the "latest updated" time on the feed when the given date is newer.
     *
     * @param int|string $article_date Unix timestamp of the article
     */
    private function remember_latest_update($article_date) : void
    {
        if ($article_date > $this->_feed->latestupdate) {
            $this->_feed->latestupdate = $article_date;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }
    }

    /**
     * Applies the collected values and writes the article to the database.
     *
     * @return bool False when creating or updating the article failed
     */
    private function save_article(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        if (!$article->id) {
            $this->apply_values($article, $values, $meta_values);
            return (bool) $article->create();
        }

        // Existing article: only write to the database when something actually changed
        $changed = $this->apply_values($article, $values, $meta_values);
        return !$changed || $article->update();
    }

    /**
     * Runs the steps that follow a successful save: optional approval, tagging
     * and storing enclosure and replies information as parameters.
     */
    private function finalize_article(midcom_db_article $article, net_nemein_rss_parser_item $item) : void
    {
        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        if ($link = $item->get_link(rel: 'replies')) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $link);
        }
    }
240
241 1
    /**
     * Determines which person to credit as author of an item.
     *
     * A forced default author always wins. Otherwise the author named in the
     * item is used if it matches a person other than ID 1; failing that, the
     * feed's default author, and as a last resort the person with ID 1.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        if (   $this->_feed->forceauthor
            && $this->_feed->defaultauthor) {
            // The feed insists on its configured author
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        $fallback_person_id = 1;
        $matched = $this->match_item_author($item);

        if (   $matched
            && $matched->id != $fallback_person_id) {
            return $matched;
        }

        // No usable match: prefer the feed's default author, else "Midgard Admin"
        return new midcom_db_person($this->_feed->defaultauthor ?: $fallback_person_id);
    }
263
264 1
    /**
     * Locates the article to write an item into.
     *
     * @param string $guid Feed item ID; only its first 255 bytes are used for the lookup
     * @return midcom_db_article|null The previously imported article, a new unsaved
     *                                article for the feed's topic, or null when another
     *                                article in the topic already has the item's link
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $topic = $this->_feed->node;

        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $topic);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $by_guid->execute();
        if ($existing) {
            // Imported on an earlier run, so this will be an update
            return $existing[0];
        }

        // The same entry may arrive through a different feed; detect that by its link
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $topic);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Already present under another GUID, skip it
                return null;
            }
        }

        // Not seen before: prepare a fresh article in the feed's topic
        $fresh = new midcom_db_article();
        $fresh->topic = $topic;
        return $fresh;
    }
290
291 1
    /**
     * Copies values onto the article and its metadata, touching only the
     * fields whose current value is not identical (===) to the new one.
     *
     * @param array $values Article field name => value
     * @param array $meta_values Metadata field name => value
     * @return bool True if at least one field was changed
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $field => $new_value) {
            if ($article->$field === $new_value) {
                continue;
            }
            $article->$field = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $field => $new_value) {
            if ($article->metadata->$field === $new_value) {
                continue;
            }
            $article->metadata->$field = $new_value;
            $changed = true;
        }

        return $changed;
    }
311
312
    /**
     * Cleans up old, removed items from feeds
     *
     * Unless the feed is configured to keep removed items, every article that
     * carries this feed's marker in extra1 and whose stored item GUID is not
     * among the given items is deleted and then purged.
     *
     * @param net_nemein_rss_parser_item[] $items Items currently present in the feed
     */
    private function clean(array $items) : void
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from the feed
            return;
        }

        // Create array of item GUIDs
        $item_guids = [];
        foreach ($items as $item) {
            $item_guid = $item->get_id();
            $item_guids[] = $item_guid;

            // find_article() looks articles up by the first 255 bytes of the GUID,
            // so an article imported from an item with a longer ID may be stored
            // under that prefix only. List the prefix as well, otherwise such an
            // article would count as removed from the feed and get purged right
            // after it was imported.
            if (   is_string($item_guid)
                && strlen($item_guid) > 255) {
                $item_guids[] = substr($item_guid, 0, 255);
            }
        }

        // Find articles resulting from this feed that are no longer part of it
        $qb = midcom_db_article::new_query_builder();
        $feed_category = md5($this->_feed->url);
        $qb->add_constraint('extra1', 'LIKE', "%|feed:{$feed_category}|%");
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);
        $local_items = $qb->execute_unchecked();

        $purge_guids = [];
        foreach ($local_items as $local_item) {
            $purge_guids[] = $local_item->guid;
            $local_item->delete();
        }

        // Deleting only soft-deletes; purge removes the collected articles for good
        softdelete::purge($purge_guids, 'midgard_article');
    }
344
345
    /**
     * Parses author formats used by different feed standards
     * and returns the information
     *
     * The author's name is used when available, otherwise the email field.
     * Values of the form "Full Name <email>" or "email (Full Name)" are split
     * into their parts; any other value is taken as a name as-is. A name
     * containing a space is reported as full name, otherwise as username.
     *
     * @return array Zero or more of the keys 'email', 'full_name' and 'username'
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        // First try dig up any information about the author possible
        if ($author = $item->get_author()) {
            $charset = midcom::get()->i18n->get_current_charset();

            if ($name = $author->get_name()) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username
                $author_info['user_or_full'] = $name;
            } else {
                // The email may be missing too (SimplePie documents a nullable return).
                // Cast it, since passing null to html_entity_decode() is deprecated as of PHP 8.1
                $name = html_entity_decode((string) $author->get_email(), ENT_QUOTES, $charset);
            }

            if (!preg_match('/[<\(]/', $name)) {
                // Plain value without embedded email address
                $author_info['user_or_full'] = $name;
            } else {
                if (str_contains($name, '<')) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                // When the pattern does not match, whatever was collected above is kept
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $author_info['user_or_full'] = trim($author_info['user_or_full']);
            if (str_contains($author_info['user_or_full'], ' ')) {
                // This value has a space in it, assuming full name
                $author_info['full_name'] = $author_info['user_or_full'];
            } else {
                $author_info['username'] = $author_info['user_or_full'];
            }
            // The intermediate key is not part of the result
            unset($author_info['user_or_full']);
        }

        return $author_info;
    }
394
395
    /**
     * Tries to find the person in the database who wrote the given item.
     *
     * The parsed author information is checked in this order: email address,
     * username, and finally first name plus last name.
     *
     * @return midcom_db_person|null The matching person, or null if none was found
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        $info = self::parse_item_author($item);

        // An email address is the most reliable identifier, so try it first
        if (!empty($info['email'])) {
            $by_email = midcom_db_person::new_query_builder();
            $by_email->add_constraint('email', '=', $info['email']);
            $hits = $by_email->execute();
            if ($hits) {
                return $hits[0];
            }
        }

        if (!empty($info['username'])) {
            $user = midcom::get()->auth->get_user_by_name($info['username']);
            if ($user) {
                return $user->get_storage();
            }
        }

        if (empty($info['full_name'])) {
            return null;
        }

        $name_parts = explode(' ', $info['full_name']);
        if (count($name_parts) <= 1) {
            return null;
        }

        // Western "Firstname Lastname" order is assumed; only the first two words are used
        [$firstname, $lastname] = $name_parts;

        $by_name = midcom_db_person::new_query_builder();
        $by_name->add_constraint('firstname', '=', $firstname);
        $by_name->add_constraint('lastname', '=', $lastname);
        $hits = $by_name->execute();

        return $hits ? $hits[0] : null;
    }
435
436
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * Stores link, duration and MIME type of the item's enclosures in the
     * 'net.nemein.rss:enclosure' parameter domain. Every enclosure writes the
     * same three parameter names, so with several enclosures the last one wins.
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item) : void
    {
        // SimplePie documents get_enclosures() as nullable (no enclosures => null),
        // so fall back to an empty list instead of iterating over null
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
447
448
    /**
     * Tags the article according to the rel="tag" links found in its content.
     *
     * The lowercased, markup-stripped link text becomes the tag and the link
     * target is passed along as its value. Links without text are ignored.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $tag_links = (new Crawler($article->content))->filter('a[rel="tag"]');

        $found = $tag_links->each(static fn (Crawler $link) => [
            'href' => $link->attr('href') ?? false,
            'value' => $link->text() ?? false,
        ]);

        $tags = [];
        foreach ($found as ['href' => $href, 'value' => $label]) {
            if ($label) {
                // Only links that actually carry a tag text are considered
                $tags[strtolower(strip_tags($label))] = $href;
            }
        }

        if ($tags !== []) {
            net_nemein_tag_handler::tag_object($article, $tags, $this->_node->component);
        }
    }
478
}
479