News   A
last analyzed

Complexity

Total Complexity 10

Size/Duplication

Total Lines 90
Duplicated Lines 0 %

Test Coverage

Coverage 27.03%

Importance

Changes 0
Metric Value
wmc 10
eloc 41
c 0
b 0
f 0
dl 0
loc 90
ccs 10
cts 37
cp 0.2703
rs 10

5 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A crawlHtml() 0 11 1
A normalizeIncomingDate() 0 22 4
A innerHtml() 0 11 3
A extractInfo() 0 14 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\HtmlCrawler;
6
7
use Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\Exception\InvalidDateFormat;
8
use Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\HtmlCrawlerInterface;
9
use Chemaclass\StockTicker\Domain\Crawler\Site\Shared\NewsNormalizerInterface;
10
use DateTimeImmutable;
11
use DOMNode;
12
use Symfony\Component\DomCrawler\Crawler;
13
14
/**
 * Extracts news items from the HTML of a Barrons page.
 *
 * Every `<li>` matched by `#barrons-news-infinite ul li` is converted into an array
 * with the keys `source`, `datetime`, `timezone`, `url` and `title`; the resulting
 * list is then handed to the injected normalizer to be limited.
 */
final class News implements HtmlCrawlerInterface
{
    private const SOURCE = 'Barrons';

    /**
     * Expected shape of the inner HTML of a single news `<li>`.
     */
    private const NEWS_ITEM_PATTERN = '/^<span class="date">(?<date>.+)<\/span><a href="(?<url>.+)">(?<title>.+)<\/a>/';

    /**
     * Number of trailing characters dropped from "long" incoming dates.
     *
     * NOTE(review): presumably a meridiem/time-zone suffix — confirm against real pages.
     * Whatever it is, it is discarded, so any AM/PM information it carries is lost.
     */
    private const SUFFIX_LENGTH = 8;

    /**
     * Incoming dates with at least this many characters get the suffix removed.
     *
     * This is the shortest date-with-time (16 characters) plus the suffix length.
     */
    private const MIN_LENGTH_WITH_SUFFIX = 24;

    /**
     * TODO: Refactor this logic to use regex instead... Something like this:
     * (?<month>\w{3}) (?<day>\d{1,2}), (?<year>\d{4}) ?(?<time>)
     *
     * NOTE(review): the formats do not start with `!`, so for the date-only entries
     * `createFromFormat` fills the time of day with the current system time.
     *
     * @var array<int, string> the key is the length of the incoming date,
     *                         the value is the mask-format that we can apply to it
     */
    private const DIFF_INCOMING_FORMATS = [
        11 => 'M d, Y',     // Dec 9, 2020
        12 => 'M d, Y',     // Dec 13, 2020
        16 => 'M d, Y H:i', // Dec 9, 2020 8:00
        17 => 'M d, Y H:i', // Dec 9, 2020 10:00 or Dec 13, 2020 8:00
        18 => 'M d, Y H:i', // Dec 13, 2020 10:00
    ];

    private NewsNormalizerInterface $newsNormalizer;

    public function __construct(NewsNormalizerInterface $newsNormalizer)
    {
        $this->newsNormalizer = $newsNormalizer;
    }

    /**
     * Extracts every news item found in the given HTML.
     *
     * @param string $html the raw HTML of the page
     *
     * @throws \UnexpectedValueException when a news item does not have the expected markup
     * @throws InvalidDateFormat         when the date of a news item cannot be parsed
     *
     * @return array whatever the normalizer returns after limiting the extracted items
     */
    public function crawlHtml(string $html): array
    {
        $nodes = (new Crawler($html))
            ->filter('#barrons-news-infinite ul li');

        $news = array_map(
            fn (DOMNode $node): array => $this->extractInfo($node),
            iterator_to_array($nodes),
        );

        return $this->newsNormalizer->limitByMaxToFetch($news);
    }

    /**
     * Builds the news array for a single `<li>` node.
     *
     * @throws \UnexpectedValueException when the node's inner HTML does not match the expected markup
     * @throws InvalidDateFormat         when the extracted date cannot be parsed
     *
     * @return array<string, mixed> with the keys `source`, `datetime`, `timezone`, `url` and `title`
     */
    private function extractInfo(DOMNode $node): array
    {
        $matched = preg_match(
            self::NEWS_ITEM_PATTERN,
            $this->innerHtml($node),
            $matches,
        );

        // preg_match() yields 0 on "no match" and false on error; in both cases
        // $matches has none of the named groups, so fail with a clear message
        // instead of an undefined-index warning followed by a TypeError.
        if ($matched !== 1) {
            throw new \UnexpectedValueException(
                'Could not extract the date, url and title from a Barrons news item.'
            );
        }

        return [
            'source' => self::SOURCE,
            'datetime' => $this->normalizeIncomingDate($matches['date']),
            'timezone' => $this->newsNormalizer->getTimeZoneName(),
            'url' => $matches['url'],
            'title' => $this->newsNormalizer->normalizeText($matches['title']),
        ];
    }

    /**
     * Serializes the children of the node and decodes HTML special characters in the result.
     */
    private function innerHtml(DOMNode $node): string
    {
        $innerHtml = '';

        foreach ($node->childNodes as $child) {
            // Children without an owner document cannot be serialized; they are skipped.
            if ($child->ownerDocument !== null) {
                $innerHtml .= $child->ownerDocument->saveXML($child);
            }
        }

        return htmlspecialchars_decode($innerHtml);
    }

    /**
     * Parses the incoming date and returns it in the normalizer's date-time representation.
     *
     * The format is chosen by the length of the trimmed (and suffix-stripped) text,
     * see DIFF_INCOMING_FORMATS.
     *
     * @throws InvalidDateFormat when the length is not supported or the text does not fit the format
     */
    private function normalizeIncomingDate(string $incomingDate): string
    {
        $incomingDate = trim($incomingDate);

        if (mb_strlen($incomingDate) >= self::MIN_LENGTH_WITH_SUFFIX) {
            $incomingDate = mb_substr($incomingDate, 0, -self::SUFFIX_LENGTH);
        }

        $len = mb_strlen($incomingDate);
        $incomingFormat = self::DIFF_INCOMING_FORMATS[$len] ?? null;

        if ($incomingFormat === null) {
            throw InvalidDateFormat::forIncomingDate($incomingDate);
        }

        $dt = DateTimeImmutable::createFromFormat($incomingFormat, $incomingDate);

        if ($dt === false) {
            throw InvalidDateFormat::couldNotCreateDateTime($incomingDate, $incomingFormat);
        }

        return $this->newsNormalizer->normalizeDateTime($dt);
    }
}
106