1 | <?php |
||
17 | class AdCrawler extends CrawlerAbstract |
||
18 | { |
||
19 | /** |
||
20 | * @var AdUrlParser |
||
21 | */ |
||
22 | protected $url; |
||
23 | |||
24 | /** |
||
25 | * @param $url |
||
26 | * @return AdUrlParser |
||
27 | */ |
||
28 | 14 | protected function setUrlParser($url) |
|
32 | |||
/**
 * Build the complete record describing the ad.
 *
 * The identifier and category read from the URL parser are merged with the
 * arrays produced by getPictures(), getProperties() and getDescription(),
 * in that order.
 *
 * @return array
 */
public function getAll()
{
    $identity = [
        'id'       => $this->getUrlParser()->getId(),
        'category' => $this->getUrlParser()->getCategory(),
    ];

    $pictures    = $this->getPictures();
    $properties  = $this->getProperties();
    $description = $this->getDescription();

    return array_merge($identity, $pictures, $properties, $description);
}
||
50 | |||
/**
 * Collect the picture URLs referenced by the ad.
 *
 * The HTML of every element matching ".adview_main script" is scanned for
 * leboncoin image URLs ending in ".jpg". URLs containing "thumb" are stored
 * under "images_thumbs", all others under "images". Each URL is rebuilt
 * with the scheme held in $this->sheme before being cast to a string.
 *
 * @param Crawler $node Node to search; $this->node is used when omitted
 * @return array ['images_thumbs' => string[], 'images' => string[]]
 */
public function getPictures(Crawler $node = null)
{
    $node = $node ?: $this->node;

    $images = [
        'images_thumbs' => [],
        'images' => [],
    ];

    // The dots of the host name are escaped so they only match a literal ".",
    // and the quantifiers are lazy so that several URLs sharing one line are
    // captured one by one instead of being fused into a single match.
    $pattern = '#//img.+?\.leboncoin\.fr/.*?\.jpg#';

    $node
        ->filter('.adview_main script')
        ->each(function (Crawler $crawler) use (&$images, $pattern) {
            if (!preg_match_all($pattern, $crawler->html(), $matches)) {
                return;
            }

            foreach ($matches[0] as $image) {
                // Thumbnails and full-size pictures only differ by their bucket.
                $bucket = preg_match('/thumb/', $image) ? 'images_thumbs' : 'images';

                $images[$bucket][] = (string) Http::createFromString($image)
                    ->withScheme($this->sheme);
            }
        });

    return $images;
}
||
96 | |||
/**
 * Extract the ad's properties.
 *
 * Always returns the sanitized <h1> text ("titre"), the "content" attribute
 * of the first [itemprop=availabilityStarts] element ("created_at") and the
 * number of ".ispro" elements ("is_pro"). Every <h2> holding both a
 * ".property" and a ".value" child then contributes whatever sanitize()
 * returns for that pair.
 *
 * @param Crawler $node Node to search; $this->node is used when omitted
 *
 * @return array ['properties' => array]
 */
public function getProperties(Crawler $node = null)
{
    $node = $node ?: $this->node;

    $properties = [
        'titre' => (new DefaultSanitizer)->clean(
            $node->filter('h1')->text()
        ),
        'created_at' => $node
            ->filter('*[itemprop=availabilityStarts]')
            ->first()
            ->attr('content'),
        'is_pro' => ($node->filter('.ispro')->count()),
    ];

    $node->filter('h2')
        ->each(function (Crawler $crawler) use (&$properties) {
            $label = $crawler->filter('.property');
            $value = $crawler->filter('.value');

            // Crawler::text() raises on an empty selection (Symfony DomCrawler
            // behaviour), so a heading that is not a property/value pair is
            // skipped instead of aborting the whole extraction.
            if ($label->count() === 0 || $value->count() === 0) {
                return;
            }

            $properties = array_merge(
                $properties,
                $this->sanitize($label->text(), $value->text())
            );
        });

    return ['properties' => $properties];
}
||
132 | |||
133 | /** |
||
134 | * Return the description |
||
135 | * |
||
136 | * @param Crawler $node |
||
137 | * @return array |
||
138 | */ |
||
139 | 8 | public function getDescription(Crawler $node = null) |
|
150 | |||
151 | /** |
||
152 | * Transform the properties name into a snake_case string and sanitize |
||
153 | * the value |
||
154 | * |
||
155 | * @param string $key |
||
156 | * @param string $value |
||
157 | * @return array |
||
158 | */ |
||
159 | 8 | private function sanitize($key, $value) |
|
180 | } |
||
181 |