Passed
Push — master ( 204398...443e32 )
by Chema
01:50 queued 12s
created

News   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 73
Duplicated Lines 0 %

Test Coverage

Coverage 94.44%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 9
eloc 35
c 1
b 0
f 0
dl 0
loc 73
rs 10
ccs 34
cts 36
cp 0.9444

6 Methods

Rating   Name   Duplication   Size   Complexity  
A extractInfo() 0 24 3
A innerHtml() 0 6 1
A normalizeTitle() 0 3 1
A crawlHtml() 0 11 1
A __construct() 0 3 1
A normalizeIncomingDate() 0 9 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Chemaclass\StockTicker\Domain\Crawler\Site\MarketWatch\HtmlCrawler;
6
7
use Chemaclass\StockTicker\Domain\Crawler\Site\MarketWatch\Exception\InvalidDateFormat;
8
use Chemaclass\StockTicker\Domain\Crawler\Site\MarketWatch\HtmlCrawlerInterface;
9
use Chemaclass\StockTicker\Domain\Crawler\Site\Shared\NewsNormalizerInterface;
10
use DateTimeImmutable;
11
use DOMNode;
12
use Symfony\Component\DomCrawler\Crawler;
13
14
/**
 * Extracts news articles from the "MarketWatch" tab pane of a MarketWatch HTML page.
 *
 * Each `div.element--article` element found under `div[data-tab-pane="MarketWatch"]`
 * is serialised back to HTML and parsed with a regular expression; the resulting
 * list is finally handed to the injected normalizer to be limited.
 */
final class News implements HtmlCrawlerInterface
{
    private const SOURCE = 'MarketWatch';

    /**
     * Pattern used when the article markup contains a `data-srcset=` attribute.
     *
     * NOTE(review): the `image` group starts with `.`, so it also captures the
     * character that directly follows `data-srcset=` (presumably the opening
     * quote) — confirm whether consumers rely on that.
     */
    private const PATTERN_WITH_IMAGE = '/<div data-timestamp="(?<timestamp>\d{10})(.|\n)*data-srcset=(?<image>.[^\?]*)(.|\n)*<a class="link" href="(?<url>.*)".*(?<title>(.|\n)*)<\/a>(.|\n)*<span class="article__author">by (?<author>.*)<\/span>/';

    /** Pattern used when the article markup has no `data-srcset=` attribute. */
    private const PATTERN_WITHOUT_IMAGE = '/<div data-timestamp="(?<timestamp>\d{10})(.|\n)*(.|\n)*<a class="link" href="(?<url>.*)".*(?<title>(.|\n)*)<\/a>(.|\n)*<span class="article__author">by (?<author>.*)<\/span>/';

    private NewsNormalizerInterface $newsNormalizer;

    public function __construct(NewsNormalizerInterface $newsNormalizer)
    {
        $this->newsNormalizer = $newsNormalizer;
    }

    /**
     * Crawls the given HTML and returns the news found in it.
     *
     * Article elements whose markup cannot be parsed are skipped, so one
     * unexpected element does not abort the whole crawl.
     *
     * @param string $html full HTML document to search for article elements
     *
     * @return array whatever the normalizer's `limitByMaxToFetch()` returns for the extracted list
     */
    public function crawlHtml(string $html): array
    {
        $nodes = (new Crawler($html))
            ->filter('div[data-tab-pane="MarketWatch"] div div.element--article');

        $news = [];

        foreach ($nodes as $node) {
            $info = $this->extractInfo($node);

            if (null !== $info) {
                $news[] = $info;
            }
        }

        return $this->newsNormalizer->limitByMaxToFetch($news);
    }

    /**
     * Builds the news entry for a single article node.
     *
     * @return array<string, mixed>|null the entry (keys: source, author, datetime, timezone,
     *                                   url, title, images), or null when the markup does not
     *                                   match the expected structure or the regex engine fails
     */
    private function extractInfo(DOMNode $node): ?array
    {
        // Serialise once and reuse the result for both the pattern choice and the match.
        $html = $this->innerHtml($node);

        $pattern = false !== mb_strpos($html, 'data-srcset=')
            ? self::PATTERN_WITH_IMAGE
            : self::PATTERN_WITHOUT_IMAGE;

        // preg_match() yields 0 on "no match" and false on engine errors (e.g. the
        // backtrack limit); in both cases $matches lacks the named groups read below.
        if (1 !== preg_match($pattern, $html, $matches)) {
            return null;
        }

        return [
            'source' => self::SOURCE,
            'author' => $matches['author'],
            'datetime' => $this->normalizeIncomingDate((int) $matches['timestamp']),
            'timezone' => $this->newsNormalizer->getTimeZoneName(),
            'url' => $matches['url'],
            'title' => $this->newsNormalizer->normalizeText($this->normalizeTitle($matches['title'])),
            // The `image` group only exists in the with-image pattern.
            'images' => isset($matches['image']) ? [$matches['image']] : null,
        ];
    }

    /**
     * Serialises the node (including the node's own tag) to an HTML string,
     * trimmed and with HTML special-character entities decoded.
     */
    private function innerHtml(DOMNode $node): string
    {
        // A node can only be serialised on its own through a document that owns it,
        // so a deep copy is imported into a fresh, otherwise empty document.
        $doc = new \DOMDocument();
        $doc->appendChild($doc->importNode($node, true));

        return htmlspecialchars_decode(trim($doc->saveHTML()));
    }

    /**
     * Converts the timestamp captured from the markup into the normalizer's date format.
     *
     * @throws InvalidDateFormat when no date object could be created from the timestamp
     */
    private function normalizeIncomingDate(int $timestamp): string
    {
        $dt = (new DateTimeImmutable())->setTimestamp($timestamp);

        // Kept for older PHP versions, where setTimestamp() was documented to return false on failure.
        if (false === $dt) {
            throw InvalidDateFormat::couldNotCreateFromTimestamp($timestamp);
        }

        return $this->newsNormalizer->normalizeDateTime($dt);
    }

    /**
     * Removes line breaks from the captured title and trims surrounding whitespace.
     */
    private function normalizeTitle(string $title): string
    {
        // Double quotes are required: in single quotes '\n' is a literal backslash
        // followed by "n", not a line feed, so real line breaks were never removed.
        return trim(str_replace(["\n", "\r"], '', $title));
    }
}
89