Passed
Push — master ( aea92c...5347c1 )
by Andreas
28:06 queued 12s
created

net_nemein_rss_fetch::parse_item_author()   B

Complexity

Conditions 8
Paths 33

Size

Total Lines 45
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 8.0368

Importance

Changes 0
Metric Value
cc 8
eloc 27
c 0
b 0
f 0
nc 33
nop 1
dl 0
loc 45
ccs 22
cts 24
cp 0.9167
crap 8.0368
rs 8.4444
1
<?php
2
/**
3
 * @package net.nemein.rss
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
use Symfony\Component\DomCrawler\Crawler;
10
use midcom\dba\softdelete;
11
12
/**
13
 * RSS and Atom feed fetching class. Caches the fetched items as articles
14
 * in net.nehmer.blog
15
 *
16
 * @package net.nemein.rss
17
 */
18
class net_nemein_rss_fetch
19
{
20
    /**
     * Error reported by the feed parser during fetch().
     *
     * Holds whatever SimplePie::error() returned; only assigned when a fetch fails.
     */
    public $lasterror;

    /**
     * The feed this fetcher operates on
     *
     * @var net_nemein_rss_feed_dba
     */
    private $_feed;

    /**
     * Name of the midcom_db_article property in which the feed item ID is stored
     */
    private $_guid_property = 'extra2';

    /**
     * Topic the feed items are imported into (loaded from the feed's node)
     *
     * @var midcom_db_topic
     */
    private $_node;
41
42
    /**
     * Binds the fetcher to a feed and loads the topic the feed imports into.
     *
     * @param net_nemein_rss_feed_dba $feed Feed to work on; its node property identifies the target topic
     */
    public function __construct(net_nemein_rss_feed_dba $feed)
    {
        $this->_node = new midcom_db_topic($feed->node);
        $this->_feed = $feed;
    }
50
51 2
    /**
     * Builds a SimplePie instance configured for this component.
     *
     * Items are produced as net_nemein_rss_parser_item objects, output is
     * encoded in the current i18n charset and the cache lives in the
     * configured midcom_tempdir.
     */
    public static function get_parser() : SimplePie
    {
        $simplepie = new SimplePie;
        $simplepie->get_registry()->register('Item', net_nemein_rss_parser_item::class);

        $charset = midcom::get()->i18n->get_current_charset();
        $simplepie->set_output_encoding($charset);

        $cache_dir = midcom::get()->config->get('midcom_tempdir');
        $simplepie->set_cache_location($cache_dir);

        return $simplepie;
    }
59
60
    /**
     * Loads the feed at the given URL with a freshly configured parser.
     *
     * No error checking is done here; callers inspect the returned parser
     * (e.g. via error()) themselves.
     *
     * @param string $url Feed URL
     * @return SimplePie The initialized parser
     */
    public static function raw_fetch(string $url) : SimplePie
    {
        $simplepie = self::get_parser();
        $simplepie->set_feed_url($url);
        $simplepie->init();

        return $simplepie;
    }
70
71
    /**
     * Fetch given RSS or Atom feed
     *
     * Returns an empty array when the parser reports an error (the error is
     * kept in $lasterror) or when the ETag sent by the server equals the one
     * stored from the previous fetch. Otherwise the new ETag (if any) is
     * stored, the feed's latestfetch timestamp is updated and the parsed
     * items are returned.
     *
     * @return net_nemein_rss_parser_item[] Array of normalized feed items
     */
    public function fetch() : array
    {
        $parser = self::raw_fetch($this->_feed->url);

        $error = $parser->error();
        if ($error) {
            $this->lasterror = $error;
            return [];
        }

        if (!empty($parser->data['headers']['etag'])) {
            // Etag checking
            $etag = trim($parser->data['headers']['etag']);

            $feed_etag = $this->_feed->get_parameter('net.nemein.rss', 'etag');
            // Strict comparison: ETags are opaque strings, loose == would treat
            // numeric-looking values such as "1e3" and "1000" as identical
            if (   !empty($feed_etag)
                && (string) $feed_etag === $etag) {
                // Feed hasn't changed, skip updating
                debug_add("Feed {$this->_feed->url} has not changed since " . date('c', $this->_feed->latestfetch), MIDCOM_LOG_WARN);
                return [];
            }

            $this->_feed->set_parameter('net.nemein.rss', 'etag', $etag);
        }

        // Record the fetch time without creating a revision for it
        $this->_feed->latestfetch = time();
        $this->_feed->_use_rcs = false;
        $this->_feed->update();

        return $parser->get_items();
    }
104
105
    /**
     * Fetches the feed and imports every item into the target topic.
     *
     * Nothing happens when the target topic has no component or the fetch
     * yields no items. After importing, clean() is run on the fetched items.
     *
     * @return net_nemein_rss_parser_item[] The fetched items in feed order
     */
    public function import() : array
    {
        if (!$this->_node->component) {
            return [];
        }

        $fetched = $this->fetch();
        if (empty($fetched)) {
            // This feed didn't return any items, skip
            return [];
        }

        // Import in reverse feed order so that creation times remain in correct
        // order even for feeds without timestamps
        $import_order = array_reverse($fetched);

        foreach ($import_order as $item) {
            $guid = $this->import_item($item);
            if (!$guid) {
                debug_add("Failed to import item " . $item->get_id() . ': ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
                continue;
            }
            $item->set_local_guid($guid);
            debug_add("Imported item " . $item->get_id() . ' as ' . $guid, MIDCOM_LOG_INFO);
        }

        $this->clean($import_order);

        // Hand the items back in their original order
        return array_reverse($import_order);
    }
137
138
    /**
     * Imports a single feed item into the database.
     *
     * Only topics running net.nehmer.blog are supported.
     *
     * @return string|null GUID of the stored article, null if nothing was stored
     * @throws midcom_error If the target topic runs any other component
     */
    public function import_item(net_nemein_rss_parser_item $item) : ?string
    {
        if ($this->_node->component === 'net.nehmer.blog') {
            return $this->import_article($item);
        }

        throw new midcom_error("RSS fetching for component {$this->_node->component} is unsupported");
    }
148
149
    /**
     * Creates or updates the article that represents a feed item.
     *
     * Items with neither a usable title nor an ID are skipped, as are items
     * find_article() rejects as duplicates. As a side effect, the feed's
     * latestupdate timestamp is advanced when the item is newer.
     *
     * @return string|null GUID of the article, or null when the item was skipped or could not be saved
     */
    private function import_article(net_nemein_rss_parser_item $item) : ?string
    {
        $guid = $item->get_id();
        $title = $item->get_title();

        $title_unusable = empty($title) || trim($title) == '...';
        if ($title_unusable && empty($guid)) {
            // Something wrong with this entry, skip it
            return null;
        }

        $article = $this->find_article($item, $guid);
        if (!$article) {
            return null;
        }

        $article->allow_name_catenate = true;
        $rcs_template = midcom::get()->i18n->get_string('%s was imported from %s', 'net.nemein.rss');
        $article->set_rcs_message(sprintf($rcs_template, $title, $this->_feed->title));

        $fields = [
            'title' => $title,
            $this->_guid_property => $guid, // FIXME: This breaks with URLs longer than 255 chars
            'content' => $item->get_content(),
            'url' => $item->get_link(),
            'extra1' => '|feed:' . md5($this->_feed->url) . '|',
        ];
        $metadata = [];

        // Safety, make sure we have a sane name (allow_name_catenate was set
        // above, so this will not clash)
        if (empty($article->name)) {
            $fields['name'] = midcom_helper_misc::urlize($title);
        }

        // Append categories provided in the feed to the pipe-separated list
        $categories = $item->get_categories();
        if (is_array($categories)) {
            foreach ($categories as $category) {
                // The pipe is the list separator, so it must not occur inside a term
                $fields['extra1'] .= str_replace('|', '_', trim($category->get_term())) . '|';
            }
        }

        $author = $this->find_author($item);
        if (!empty($author->guid)) {
            $metadata['authors'] = '|' . $author->guid . '|';
        }

        // Try to figure out item publication date, falling back to "now"
        $published = $item->get_date('U');
        $date_from_feed = (bool) $published;
        if (!$date_from_feed) {
            $published = time();
        }

        if ($published > $this->_feed->latestupdate) {
            // Cache "latest updated" time to feed
            $this->_feed->latestupdate = $published;
            $this->_feed->_use_rcs = false;
            $this->_feed->update();
        }

        $is_existing = (bool) $article->id;
        if ($is_existing && $date_from_feed) {
            // Only existing articles get the feed's date, and only if it is a real one
            $metadata['published'] = $published;
        }

        $changed = $this->apply_values($article, $fields, $metadata);
        if ($is_existing) {
            // Unchanged articles are not written again
            $stored = !$changed || $article->update();
        } else {
            $stored = $article->create();
        }
        if (!$stored) {
            return null;
        }

        if ($this->_feed->autoapprove) {
            $article->metadata->approve();
        }

        $this->_parse_tags($article);
        $this->_parse_parameters($article, $item);

        // store <link rel="replies"> url in parameter
        $replies_url = $item->get_link(0, 'replies');
        if ($replies_url) {
            $article->set_parameter('net.nemein.rss', 'replies_url', $replies_url);
        }

        return $article->guid;
    }
247
248 1
    /**
     * Determines the person to record as author of an item.
     *
     * Order of preference: the feed's default author when forceauthor is set,
     * a person matched from the item's author data, the feed's default
     * author, and finally the person with ID 1.
     */
    private function find_author(net_nemein_rss_parser_item $item) : midcom_db_person
    {
        if (   $this->_feed->forceauthor
            && $this->_feed->defaultauthor) {
            // Feed is configured to always use its default author
            return new midcom_db_person($this->_feed->defaultauthor);
        }

        $fallback_person_id = 1;
        $matched = $this->match_item_author($item);
        if (   $matched
            && $matched->id != $fallback_person_id) {
            return $matched;
        }

        // No usable match: prefer the feed's default author, otherwise fall
        // back to "Midgard Admin" just in case
        return new midcom_db_person($this->_feed->defaultauthor ?: $fallback_person_id);
    }
270
271 1
    /**
     * Looks up the article an item should be written to.
     *
     * @param net_nemein_rss_parser_item $item The feed item
     * @param string $guid The item's ID; only the first 255 characters are used for the lookup
     * @return midcom_db_article|null The previously imported article, a new unsaved article
     *     in the feed's topic, or null when another article in the topic already has the item's link
     */
    private function find_article(net_nemein_rss_parser_item $item, string $guid) : ?midcom_db_article
    {
        $by_guid = midcom_db_article::new_query_builder();
        $by_guid->add_constraint('topic', '=', $this->_feed->node);
        $by_guid->add_constraint($this->_guid_property, '=', substr($guid, 0, 255));
        $existing = $by_guid->execute();
        if (!empty($existing)) {
            // This item has been imported already earlier. Update
            return $existing[0];
        }

        // Check against duplicate hits that may come from different feeds
        $link = $item->get_link();
        if ($link) {
            $by_url = midcom_db_article::new_query_builder();
            $by_url->add_constraint('topic', '=', $this->_feed->node);
            $by_url->add_constraint('url', '=', $link);
            if ($by_url->count() > 0) {
                // Dupe, skip
                return null;
            }
        }

        // This is a new item
        $fresh = new midcom_db_article();
        $fresh->topic = $this->_feed->node;
        return $fresh;
    }
298
299 1
    /**
     * Copies field and metadata values onto an article, touching only those
     * that differ (strict comparison) from what the article already holds.
     *
     * @param midcom_db_article $article Article to modify (not saved here)
     * @param array $values Property name => value pairs for the article itself
     * @param array $meta_values Property name => value pairs for $article->metadata
     * @return bool True if at least one value was changed
     */
    private function apply_values(midcom_db_article $article, array $values, array $meta_values) : bool
    {
        $changed = false;

        foreach ($values as $property => $new_value) {
            if ($article->$property === $new_value) {
                continue;
            }
            $article->$property = $new_value;
            $changed = true;
        }

        foreach ($meta_values as $property => $new_value) {
            if ($article->metadata->$property === $new_value) {
                continue;
            }
            $article->metadata->$property = $new_value;
            $changed = true;
        }

        return $changed;
    }
319
320
    /**
     * Deletes and purges articles of this feed that are no longer in the feed.
     *
     * Articles are selected by the feed marker in extra1 and by their stored
     * item ID not being among the given items. Does nothing when the feed is
     * configured with keepremoved.
     *
     * @param net_nemein_rss_parser_item[] $items Items currently present in the feed
     */
    private function clean(array $items)
    {
        if ($this->_feed->keepremoved) {
            // This feed is set up so that we retain items removed from the feed
            return;
        }

        // IDs of everything that is still in the feed
        $item_guids = array_values(array_map(function ($item) {
            return $item->get_id();
        }, $items));

        // Find articles resulting from this feed that are not among them
        $qb = midcom_db_article::new_query_builder();
        $qb->add_constraint('extra1', 'LIKE', '%|feed:' . md5($this->_feed->url) . '|%');
        $qb->add_constraint($this->_guid_property, 'NOT IN', $item_guids);

        $purge_guids = [];
        foreach ($qb->execute_unchecked() as $stale_article) {
            $purge_guids[] = $stale_article->guid;
            $stale_article->delete();
        }

        softdelete::purge($purge_guids, 'midgard_article');
    }
352
353
    /**
     * Parses author formats used by different feed standards and
     * returns the information
     *
     * The author's name is used when present, otherwise the email field.
     * The value is entity-decoded and then interpreted as a plain
     * name/username, as "Full Name <email>" or as "email (Full Name)".
     *
     * @return array May contain the keys 'email', 'full_name' (value contains a space)
     *     and 'username' (value without a space); empty if the item has no author
     */
    public static function parse_item_author(net_nemein_rss_parser_item $item) : array
    {
        $author_info = [];

        // First try dig up any information about the author possible
        if ($author = $item->get_author()) {
            $charset = midcom::get()->i18n->get_current_charset();
            $name = $author->get_name();

            if (!empty($name)) {
                $name = html_entity_decode($name, ENT_QUOTES, $charset);
                // Atom feed, the value can be either full name or username
                $author_info['user_or_full'] = $name;
            } else {
                // The email may be missing as well; cast so that null is never
                // handed to html_entity_decode() (deprecated since PHP 8.1)
                $name = html_entity_decode((string) $author->get_email(), ENT_QUOTES, $charset);
            }

            if (!preg_match('/(<|\()/', $name)) {
                // No email markup, take the value as it is
                $author_info['user_or_full'] = $name;
            } else {
                if (strpos($name, '<') !== false) {
                    // The classic "Full Name <email>" format
                    $regex = '/(?<fullname>.+) <?(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+)>?[ ,]?/';
                } else {
                    // The classic "email (Full Name)" format
                    $regex = '/^(?<email>[a-zA-Z0-9_.-]+?@[a-zA-Z0-9_.-]+) \((?<fullname>.+)\)$/';
                }
                if (preg_match($regex, $name, $matches)) {
                    $author_info['email'] = $matches['email'];
                    $author_info['user_or_full'] = $matches['fullname'];
                }
            }
        }

        if (isset($author_info['user_or_full'])) {
            $value = trim($author_info['user_or_full']);
            unset($author_info['user_or_full']);

            // A value with a space in it is assumed to be a full name
            $key = (strpos($value, ' ') !== false) ? 'full_name' : 'username';
            $author_info[$key] = $value;
        }

        return $author_info;
    }
403
404
    /**
     * Tries to find the person in the database that an item's author data
     * refers to.
     *
     * Lookups are attempted by email, then by username, then by the first two
     * space-separated parts of the full name as firstname and lastname.
     *
     * @return midcom_db_person|null The first match, or null if no lookup succeeded
     */
    public function match_item_author(net_nemein_rss_parser_item $item) : ?midcom_db_person
    {
        $info = self::parse_item_author($item);

        if (!empty($info['email'])) {
            // Email is a pretty good identifier, start with it
            $qb = midcom_db_person::new_query_builder();
            $qb->add_constraint('email', '=', $info['email']);
            $by_email = $qb->execute();
            if (!empty($by_email)) {
                return $by_email[0];
            }
        }

        if (!empty($info['username'])) {
            $user = midcom::get()->auth->get_user_by_name($info['username']);
            if ($user) {
                return $user->get_storage();
            }
        }

        if (empty($info['full_name'])) {
            return null;
        }

        $name_parts = explode(' ', $info['full_name']);
        if (count($name_parts) < 2) {
            return null;
        }

        // We assume the western format Firstname Lastname
        [$firstname, $lastname] = $name_parts;

        $qb = midcom_db_person::new_query_builder();
        $qb->add_constraint('firstname', '=', $firstname);
        $qb->add_constraint('lastname', '=', $lastname);
        $by_name = $qb->execute();

        return empty($by_name) ? null : $by_name[0];
    }
447
448
    /**
     * Parses additional metadata in RSS item and sets parameters accordingly
     *
     * Stores url, duration and mimetype of the item's enclosure in the
     * net.nemein.rss:enclosure parameter domain. The same three parameter
     * names are written for every enclosure, so with several enclosures the
     * last one is what remains stored.
     */
    private function _parse_parameters(midcom_db_article $article, net_nemein_rss_parser_item $item)
    {
        // SimplePie returns null instead of an empty array when an item has no
        // enclosures, so normalize before iterating
        foreach ($item->get_enclosures() ?? [] as $enclosure) {
            $article->set_parameter('net.nemein.rss:enclosure', 'url', $enclosure->get_link());
            $article->set_parameter('net.nemein.rss:enclosure', 'duration', $enclosure->get_duration());
            $article->set_parameter('net.nemein.rss:enclosure', 'mimetype', $enclosure->get_type());
        }
    }
459
460
    /**
     * Tags the article based on the rel-tag links found in its content.
     *
     * Each <a rel="tag"> contributes its lowercased text as tag and its href
     * as the tag's value; links without text are ignored.
     */
    private function _parse_tags(midcom_db_article $article)
    {
        $crawler = new Crawler($article->content);

        $links = $crawler->filter('a[rel="tag"]')->each(function(Crawler $node) {
            return [
                'href' => $node->attr('href') ?? false,
                'value' => $node->text() ?? false,
            ];
        });

        $tags = [];
        foreach ($links as $link) {
            if ($link['value']) {
                $tags[strtolower(strip_tags($link['value']))] = $link['href'];
            }
            // else: no actual tag specified, skip
        }

        if ($tags) {
            net_nemein_tag_handler::tag_object($article, $tags, $this->_node->component);
        }
    }
490
}
491