| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | declare(strict_types=1); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | namespace Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\HtmlCrawler; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | use Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\Exception\InvalidDateFormat; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | use Chemaclass\StockTicker\Domain\Crawler\Site\Barrons\HtmlCrawlerInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | use Chemaclass\StockTicker\Domain\Crawler\Site\Shared\NewsNormalizerInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | use DateTimeImmutable; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | use DOMNode; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | use Symfony\Component\DomCrawler\Crawler; | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                /**
                 * Crawls the Barrons news list and turns every recognised list item into a plain array.
                 *
                 * The list items are selected with the CSS selector `#barrons-news-infinite ul li`.
                 */
                final class News implements HtmlCrawlerInterface
                {
                    private const SOURCE = 'Barrons';

                    /**
                     * Markup that a list item must have to be treated as a news entry:
                     * `<span class="date">DATE</span><a href="URL" ...>TITLE</a>`.
                     *
                     * The captures are deliberately narrow (`[^"]+`, lazy `.+?`) so that extra
                     * attributes on the anchor or further markup in the same item do not leak
                     * into the `url` / `title` groups.
                     */
                    private const NEWS_ITEM_PATTERN = '/^\s*<span class="date">(?<date>.+?)<\/span>\s*'
                        . '<a href="(?<url>[^"]+)"[^>]*>(?<title>.+?)<\/a>/s';

                    /**
                     * Leading part of an incoming date: "Dec 9, 2020", optionally followed by a
                     * time such as "8:00". Anything after that (a suffix of any length) is ignored.
                     */
                    private const INCOMING_DATE_PATTERN = '/^(?<date>[A-Za-z]+ \d{1,2}, \d{4})(?: (?<time>\d{1,2}:\d{2}))?/';

                    /**
                     * The leading `!` resets every field that is not part of the format.
                     * Without it createFromFormat() would fill the missing time of day with
                     * the current system time, making date-only results non-deterministic.
                     */
                    private const DATE_FORMAT = '!M d, Y';
                    private const DATE_TIME_FORMAT = '!M d, Y H:i';

                    private NewsNormalizerInterface $newsNormalizer;

                    public function __construct(NewsNormalizerInterface $newsNormalizer)
                    {
                        $this->newsNormalizer = $newsNormalizer;
                    }

                    /**
                     * Extracts the news entries from the given HTML.
                     *
                     * List items that do not look like a news entry (see NEWS_ITEM_PATTERN) are skipped.
                     * The collected entries are handed to NewsNormalizerInterface::limitByMaxToFetch()
                     * and its result is returned.
                     *
                     * @throws InvalidDateFormat when the date of a recognised entry cannot be parsed
                     */
                    public function crawlHtml(string $html): array
                    {
                        $nodes = (new Crawler($html))
                            ->filter('#barrons-news-infinite ul li');

                        $news = [];

                        foreach ($nodes as $node) {
                            $info = $this->extractInfo($node);

                            if ($info !== null) {
                                $news[] = $info;
                            }
                        }

                        return $this->newsNormalizer->limitByMaxToFetch($news);
                    }

                    /**
                     * Builds the entry (keys: source, datetime, timezone, url, title) for one list item.
                     *
                     * @return array<string, mixed>|null null when the item does not match NEWS_ITEM_PATTERN
                     *
                     * @throws InvalidDateFormat when the captured date cannot be parsed
                     */
                    private function extractInfo(DOMNode $node): ?array
                    {
                        $matched = preg_match(
                            self::NEWS_ITEM_PATTERN,
                            $this->innerHtml($node),
                            $matches,
                        );

                        // preg_match() returns 0 on "no match" and false on error; in both cases
                        // $matches has no usable groups, so the item is not a news entry.
                        if ($matched !== 1) {
                            return null;
                        }

                        return [
                            'source' => self::SOURCE,
                            'datetime' => $this->normalizeIncomingDate($matches['date']),
                            'timezone' => $this->newsNormalizer->getTimeZoneName(),
                            'url' => $matches['url'],
                            'title' => $this->newsNormalizer->normalizeText($matches['title']),
                        ];
                    }

                    /**
                     * Serialises all child nodes of the given node and decodes the special
                     * HTML entities in the result.
                     */
                    private function innerHtml(DOMNode $node): string
                    {
                        $innerHtml = '';

                        foreach ($node->childNodes as $child) {
                            // A node that is not attached to a document cannot be serialised.
                            if ($child->ownerDocument !== null) {
                                $innerHtml .= $child->ownerDocument->saveXML($child);
                            }
                        }

                        return htmlspecialchars_decode($innerHtml);
                    }

                    /**
                     * Parses an incoming date such as "Dec 9, 2020" or "Dec 13, 2020 8:00 ..."
                     * and returns it as formatted by NewsNormalizerInterface::normalizeDateTime().
                     *
                     * The date is recognised by its pattern rather than by its length, so one- and
                     * two-digit days and hours are all accepted, and any trailing suffix is ignored.
                     *
                     * NOTE(review): a trailing am/pm marker, if the site sends one, is ignored here
                     * exactly as it was before; confirm whether the hour needs to be adjusted.
                     *
                     * @throws InvalidDateFormat when the value does not start with a recognisable date
                     *                           or when no DateTimeImmutable can be created from it
                     */
                    private function normalizeIncomingDate(string $incomingDate): string
                    {
                        $incomingDate = trim($incomingDate);

                        if (preg_match(self::INCOMING_DATE_PATTERN, $incomingDate, $parts) !== 1) {
                            throw InvalidDateFormat::forIncomingDate($incomingDate);
                        }

                        // An unmatched optional group is either missing or an empty string.
                        $time = $parts['time'] ?? '';

                        if ($time === '') {
                            $format = self::DATE_FORMAT;
                            $value = $parts['date'];
                        } else {
                            $format = self::DATE_TIME_FORMAT;
                            $value = $parts['date'] . ' ' . $time;
                        }

                        $dt = DateTimeImmutable::createFromFormat($format, $value);

                        if ($dt === false) {
                            throw InvalidDateFormat::couldNotCreateDateTime($value, $format);
                        }

                        return $this->newsNormalizer->normalizeDateTime($dt);
                    }
                }
            
                                                                        
                                                                
            
                                    
            
            
                | 106 |  |  |  |