News::normalizeIncomingDate()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 2
dl 0
loc 5
ccs 3
cts 3
cp 1
rs 10
c 1
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Chemaclass\StockTicker\Domain\Crawler\Site\MarketWatch\HtmlCrawler;
6
7
use Chemaclass\StockTicker\Domain\Crawler\Site\MarketWatch\HtmlCrawlerInterface;
8
use Chemaclass\StockTicker\Domain\Crawler\Site\Shared\NewsNormalizerInterface;
9
use DateTimeImmutable;
10
use DOMDocument;
11
use DOMNode;
12
use Symfony\Component\DomCrawler\Crawler;
13
14
/**
 * Extracts news entries from a MarketWatch HTML page.
 *
 * Each article node found in the page is serialised back to HTML and parsed
 * with a regular expression; the resulting list is then passed through the
 * injected normalizer's limitByMaxToFetch().
 */
final class News implements HtmlCrawlerInterface
{
    private const SOURCE = 'MarketWatch';

    /**
     * Pattern used when the article markup contains a `data-srcset=` attribute.
     *
     * NOTE(review): the `image` group starts with `.`, which appears to capture
     * the attribute's opening quote as well — confirm against consumers.
     */
    private const PATTERN_WITH_IMAGE = '/<div data-timestamp="(?<timestamp>\d{10})(.|\n)*data-srcset=(?<image>.[^\?]*)(.|\n)*<a class="link" href="(?<url>.*)".*(?<title>(.|\n)*)<\/a>(.|\n)*<span class="article__author">by (?<author>.*)<\/span>/';

    /** Pattern used when the article markup has no `data-srcset=` attribute. */
    private const PATTERN_WITHOUT_IMAGE = '/<div data-timestamp="(?<timestamp>\d{10})(.|\n)*(.|\n)*<a class="link" href="(?<url>.*)".*(?<title>(.|\n)*)<\/a>(.|\n)*<span class="article__author">by (?<author>.*)<\/span>/';

    private NewsNormalizerInterface $newsNormalizer;

    public function __construct(NewsNormalizerInterface $newsNormalizer)
    {
        $this->newsNormalizer = $newsNormalizer;
    }

    /**
     * Collects every article element inside the "MarketWatch" tab pane and
     * converts each one into a news entry.
     *
     * @param string $html Raw HTML of the page to crawl.
     *
     * @return array The extracted entries after limitByMaxToFetch() was applied.
     */
    public function crawlHtml(string $html): array
    {
        $nodes = (new Crawler($html))
            ->filter('div[data-tab-pane="MarketWatch"] div div.element--article');

        $news = array_map(
            fn (DOMNode $node): array => $this->extractInfo($node),
            iterator_to_array($nodes),
        );

        return $this->newsNormalizer->limitByMaxToFetch($news);
    }

    /**
     * Builds a single news entry from one article node.
     *
     * Any field the pattern could not capture falls back to a placeholder
     * ('Unknown author', 'Unknown url', 'Unknown title', timestamp 0, or
     * null for images).
     *
     * @return array Entry with the keys source, author, datetime, timezone,
     *               url, title and images.
     */
    private function extractInfo(DOMNode $node): array
    {
        // Serialise the node once and reuse the result for both the
        // pattern selection and the actual match.
        $innerHtml = $this->innerHtml($node);

        $pattern = mb_strpos($innerHtml, 'data-srcset=') !== false
            ? self::PATTERN_WITH_IMAGE
            : self::PATTERN_WITHOUT_IMAGE;

        $matches = [];
        preg_match($pattern, $innerHtml, $matches);

        return [
            'source' => self::SOURCE,
            'author' => $matches['author'] ?? 'Unknown author',
            'datetime' => $this->normalizeIncomingDate((int) ($matches['timestamp'] ?? 0)),
            'timezone' => $this->newsNormalizer->getTimeZoneName(),
            'url' => $matches['url'] ?? 'Unknown url',
            'title' => $this->newsNormalizer->normalizeText(
                $this->normalizeTitle($matches['title'] ?? 'Unknown title'),
            ),
            'images' => isset($matches['image']) ? [$matches['image']] : null,
        ];
    }

    /**
     * Serialises the given node (including its children) to an HTML string,
     * trimmed and with HTML special-character entities decoded.
     */
    private function innerHtml(DOMNode $node): string
    {
        $doc = new DOMDocument();
        $doc->appendChild($doc->importNode($node, true));

        return htmlspecialchars_decode(trim($doc->saveHTML()));
    }

    /**
     * Converts a Unix timestamp into the normalizer's date-time representation.
     */
    private function normalizeIncomingDate(int $timestamp): string
    {
        $dt = (new DateTimeImmutable())->setTimestamp($timestamp);

        return $this->newsNormalizer->normalizeDateTime($dt);
    }

    /**
     * Removes line breaks from the captured title and trims the result.
     */
    private function normalizeTitle(string $title): string
    {
        // Double quotes are required here: in single quotes '\n' and '\r' are
        // a literal backslash followed by a letter, so real line-feed and
        // carriage-return characters would never be replaced.
        return trim(str_replace(["\n", "\r"], '', $title));
    }
}
85