| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | namespace Lbc\Crawler; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | use Lbc\Filter\CitySanitizer; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | use Lbc\Filter\CpSanitizer; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | use Lbc\Filter\DefaultSanitizer; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | use Lbc\Filter\KeySanitizer; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | use Lbc\Parser\AdUrlParser; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | use League\Uri\Schemes\Http; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | use Symfony\Component\DomCrawler\Crawler; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  * Class AdCrawler | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  * @package Lbc\Crawler | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | class AdCrawler extends CrawlerAbstract | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |      * @var AdUrlParser | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |     protected $url; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Build the URL parser for this ad and keep it on the crawler.
                 *
                 * The parser is stored in $this->url and also returned, so the
                 * method honours its documented return type.
                 *
                 * @param $url Ad URL handed to the AdUrlParser constructor
                 * @return AdUrlParser The parser that was just stored
                 */
                protected function setUrlParser($url)
                {
                    $this->url = new AdUrlParser($url);

                    return $this->url;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Gather everything known about the ad into a single array.
                 *
                 * The 'id' and 'category' entries come from the URL parser; they are
                 * merged with the results of getPictures(), getProperties() and
                 * getDescription(), in that order.
                 *
                 * @return array
                 */
                public function getAll()
                {
                    $identity = [];
                    $identity['id']       = $this->getUrlParser()->getId();
                    $identity['category'] = $this->getUrlParser()->getCategory();

                    $pictures    = $this->getPictures();
                    $properties  = $this->getProperties();
                    $description = $this->getDescription();

                    return array_merge($identity, $pictures, $properties, $description);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Return the ad's picture URLs, split into thumbnails and other images.
                 *
                 * The markup of every element matching ".adview_main script" is scanned
                 * for leboncoin image URLs ending in ".jpg". A URL containing "thumb" is
                 * stored under 'images_thumbs', any other under 'images'. Each URL is
                 * rebuilt with the scheme held in $this->sheme before being stored.
                 *
                 * @param Crawler $node Node to search; $this->node is used when omitted
                 * @return array Array with the keys 'images_thumbs' and 'images', each a
                 *               list of URL strings (empty when nothing matched)
                 */
                public function getPictures(Crawler $node = null)
                {
                    $node = $node ?: $this->node;

                    $images = [
                        'images_thumbs' => [],
                        'images'        => [],
                    ];

                    // Literal dots are escaped and the wildcards cannot cross quotes or
                    // whitespace, so two URLs on the same line are matched separately
                    // instead of being swallowed into one greedy match.
                    $pattern = '#//img[^"\'\s]*\.leboncoin\.fr/[^"\'\s]*\.jpg#';

                    $node
                        ->filter('.adview_main script')
                        ->each(function (Crawler $crawler) use (&$images, $pattern) {
                            if (!preg_match_all($pattern, $crawler->html(), $matches)) {
                                return;
                            }

                            foreach ($matches[0] as $image) {
                                // Thumbnails are recognised by "thumb" appearing in the URL.
                                $bucket = strpos($image, 'thumb') !== false
                                    ? 'images_thumbs'
                                    : 'images';

                                $images[$bucket][] = (string)Http::createFromString($image)
                                    ->withScheme($this->sheme);
                            }
                        });

                    return $images;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |      * Return the common information (price, cp, city) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |      * @param Crawler $node | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |      * @return array | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 | 8 |  |     public function getProperties(Crawler $node = null) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |     { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 | 8 |  |         $node = $node ?: $this->node; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         $properties = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 | 8 |  |             'titre'      => DefaultSanitizer::clean( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 | 8 |  |                 $node->filter('h1')->text() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 | 8 |  |             ), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |             'created_at' => $node | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 | 8 |  |                 ->filter('*[itemprop=availabilityStarts]') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 | 8 |  |                 ->first() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 | 8 |  |                 ->attr('content'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 | 8 |  |             'is_pro' => ($node->filter('.ispro')->count()), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 | 8 |  |         ]; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 | 8 |  |         $node->filter('h2') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 | 8 |  |             ->each(function (Crawler $crawler) use (&$properties) { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 | 8 |  |                 $properties = array_merge( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 | 8 |  |                     $properties, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 | 8 |  |                     $this->sanitize( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 | 8 |  |                         $crawler->filter('.property')->text(), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 | 8 |  |                         $crawler->filter('.value')->text() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 | 8 |  |                     ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 | 8 |  |                 ); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 | 8 |  |             }); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 | 8 |  |         return ['properties' => $properties]; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |     } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Return the ad description.
                 *
                 * Selects the "p[itemprop=description]" element(s) of the node and
                 * hands them to getFieldValue() together with null as second argument
                 * (presumably the sanitizer/callback to apply -- confirm against
                 * getFieldValue()).
                 *
                 * @param Crawler|null $node Node to read from; when omitted (or null),
                 *                           $this->node is used instead
                 * @return array Single-entry array keyed by 'description'
                 */
                public function getDescription(Crawler $node = null)
                {
                    // Fall back to the crawler held by this instance when no node is given.
                    if (!$node) {
                        $node = $this->node;
                    }

                    $paragraph = $node->filter("p[itemprop=description]");

                    return ['description' => $this->getFieldValue($paragraph, null)];
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Normalize a property name and sanitize its value.
                 *
                 * The key is first passed through KeySanitizer::clean(). A 'ville' key
                 * is special-cased: the same raw value yields both a 'ville' entry
                 * (CitySanitizer) and a 'cp' entry (CpSanitizer). For any other key the
                 * class Lbc\Filter\<Ucfirst key>Sanitizer is used when it exists,
                 * otherwise Lbc\Filter\DefaultSanitizer.
                 *
                 * @param string $key   Raw property label
                 * @param string $value Raw property value
                 * @return array Sanitized value(s) indexed by the cleaned key(s)
                 */
                private function sanitize($key, $value)
                {
                    $key = KeySanitizer::clean($key);

                    if ('ville' == $key) {
                        // One source field feeds two properties: city name and postal code.
                        $city = CitySanitizer::clean($value);
                        $postalCode = CpSanitizer::clean($value);

                        return [
                            'ville' => $city,
                            'cp'    => $postalCode,
                        ];
                    }

                    // Look for a sanitizer dedicated to this key, else use the default one.
                    $candidate = 'Lbc\\Filter\\' . ucfirst($key) . 'Sanitizer';
                    $sanitizer = class_exists($candidate)
                        ? $candidate
                        : 'Lbc\\Filter\\DefaultSanitizer';

                    $cleaned = call_user_func("{$sanitizer}::clean", $value);

                    return [$key => $cleaned];
                }
            
                                                                                                            
                                                                
            
                                    
            
            
                | 178 |  |  | } | 
            
                                                        
            
                                    
            
            
                | 179 |  |  |  |