1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Lbc\Crawler; |
4
|
|
|
|
5
|
|
|
use Lbc\Filter\DefaultSanitizer; |
6
|
|
|
use Lbc\Filter\PrixSanitizer; |
7
|
|
|
use Lbc\Parser\AdUrlParser; |
8
|
|
|
use Lbc\Parser\SearchResultUrlParser; |
9
|
|
|
use League\Uri\Schemes\Http; |
10
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
11
|
|
|
|
12
|
|
|
/**
 * Class SearchResultAdCrawler
 *
 * Extracts the fields of a single ad (id, title, price, URL, creation date,
 * thumbnail, picture count, placement, seller type) from a search result node.
 *
 * Every getter reads from $this->node, which is not declared in this class
 * (presumably provided by CrawlerAbstract -- confirm there).
 *
 * @package Lbc\Crawler
 */
class SearchResultAdCrawler extends CrawlerAbstract
{
    /**
     * Parser wrapping the ad's URL.
     *
     * @var AdUrlParser
     */
    protected $url;

    /**
     * Wrap the given URL in an AdUrlParser and keep it on the instance.
     *
     * @param mixed $url Value forwarded unchanged to the AdUrlParser constructor
     *
     * @return void
     */
    protected function setUrlParser($url)
    {
        $this->url = new AdUrlParser($url);
    }

    /**
     * Return the Ad's ID
     *
     * The value is delegated to the URL parser's getId().
     *
     * @return string
     */
    public function getId()
    {
        return $this->url->getId();
    }

    /**
     * Return the title
     *
     * Reads the text of the "h2" element and passes it through
     * DefaultSanitizer::clean().
     *
     * NOTE(review): there is no guard for a missing "h2"; Symfony's
     * Crawler::text() is documented to throw \InvalidArgumentException on an
     * empty node list -- confirm this is the intended behaviour for the title.
     *
     * @return string
     */
    public function getTitle()
    {
        return DefaultSanitizer::clean($this->node->filter('h2')->text());
    }

    /**
     * Return the price
     *
     * Reads the element flagged with itemprop=price and passes its text
     * through PrixSanitizer::clean(); returns 0 when no such element exists.
     *
     * @return int
     */
    public function getPrice()
    {
        // Query the DOM once and reuse the result for both the check and the read.
        $price = $this->node->filter('*[itemprop=price]');

        if (0 === $price->count()) {
            return 0;
        }

        return PrixSanitizer::clean($price->text());
    }

    /**
     * Return the Ad's URL
     *
     * The URL is rebuilt with its scheme forced to "https".
     *
     * @return string
     */
    public function getUrl()
    {
        // NOTE(review): $this->url is an AdUrlParser object, not a plain
        // string; this call presumably relies on it being string-castable --
        // confirm against AdUrlParser.
        return (string)Http::createFromString($this->url)->withScheme('https');
    }

    /**
     * Return the date and time the ad was created
     *
     * Reads the "content" attribute of the first element flagged with
     * itemprop=availabilityStarts. Returns null when that element is absent,
     * in line with the other optional-field getters of this class, instead of
     * letting the crawler raise on an empty node list.
     *
     * @return null|string
     */
    public function getCreatedAt()
    {
        $node = $this->node
            ->filter('*[itemprop=availabilityStarts]')
            ->first();

        if (0 === $node->count()) {
            return null;
        }

        return $node->attr('content');
    }

    /**
     * Return the thumb picture url
     *
     * Reads the "data-imgsrc" attribute of the first lazy-loaded picture and
     * forces its scheme to "https"; returns null when the ad has no such
     * picture element.
     *
     * @return null|string
     */
    public function getThumb()
    {
        $image = $this->node
            ->filter('.item_imagePic .lazyload[data-imgsrc]')
            ->first();

        if (0 === $image->count()) {
            return null;
        }

        $src = $image->attr('data-imgsrc');

        return (string)Http::createFromString($src)->withScheme('https');
    }

    /**
     * Return the number of picture of the ad
     *
     * Reads the text of the ".item_imageNumber" element, trimmed and cast to
     * int; returns 0 when the element is absent.
     *
     * @return int
     */
    public function getNbImage()
    {
        $node = $this->node->filter('.item_imageNumber');

        return $this->getFieldValue($node, 0, function ($value) {
            return (int)trim($value);
        });
    }

    /**
     * Return the placement text of the ad
     *
     * Reads the element flagged with itemprop=availableAtOrFrom, trims it and
     * collapses every run of whitespace into a single space; returns an empty
     * string when the element is absent.
     *
     * @return string
     */
    public function getPlacement()
    {
        $node = $this->node->filter('*[itemprop=availableAtOrFrom]');

        return $this->getFieldValue($node, '', function ($value) {
            return preg_replace('/\s+/', ' ', trim($value));
        });
    }

    /**
     * Return the seller type of the ad
     *
     * Reads the element flagged with itemprop=category. Once whitespace and
     * parentheses are stripped, a text equal to "pro" yields 'pro'; any other
     * text yields 'part'. Returns false when the element is absent.
     *
     * @return string|false
     */
    public function getType()
    {
        $node = $this->node->filter('*[itemprop=category]');

        return $this->getFieldValue($node, false, function ($value) {
            if ('pro' === preg_replace('/[\s()]+/', '', $value)) {
                return 'pro';
            }

            return 'part';
        });
    }

    /**
     * Return every field of the ad as a single object
     *
     * The object exposes the properties id, titre, prix, url, created_at,
     * images_thumbs, nb_image, placement and type, each filled by the
     * matching getter of this class.
     *
     * @return object
     */
    public function getAll()
    {
        return (object)[
            'id'            => $this->getId(),
            'titre'         => $this->getTitle(),
            'prix'          => $this->getPrice(),
            'url'           => $this->getUrl(),
            'created_at'    => $this->getCreatedAt(),
            'images_thumbs' => $this->getThumb(),
            'nb_image'      => $this->getNbImage(),
            'placement'     => $this->getPlacement(),
            'type'          => $this->getType(),
        ];
    }

    /**
     * Return the field's value
     *
     * When $node matches at least one element, the value is read by calling
     * the method named $funcName on it with $funcParam as its only argument
     * (by default this is $node->text('')), then handed to $callback whose
     * result is returned. Otherwise $defaultValue is returned untouched.
     *
     * @param Crawler  $node         Node list to read from
     * @param mixed    $defaultValue Value returned when $node is empty
     * @param callable $callback     Receives the raw value, returns the final one
     * @param string   $funcName     Name of the Crawler method used to read the value
     * @param string   $funcParam    Argument passed to that method
     *
     * @return mixed
     */
    private function getFieldValue(
        Crawler $node,
        $defaultValue,
        callable $callback,
        $funcName = 'text',
        $funcParam = ''
    ) {
        if ($node->count()) {
            return $callback($node->$funcName($funcParam));
        }

        return $defaultValue;
    }
}
198
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.