1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Lbc\Crawler; |
4
|
|
|
|
5
|
|
|
use League\Uri\Schemes\Http; |
6
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
7
|
|
|
|
8
|
|
|
/**
 * Extracts the fields of a single ad from a search result page.
 *
 * At the time this code was written, an ad followed this structure:
 *
 * <a href="http://www.leboncoin.fr/{{ $category }}/{{ $id }}.htm?ca=4_s" title="{{ $title }}">
 *     <div class="lbc">
 *         <div class="date">
 *             <div>{{ $date }}</div>
 *             <div>{{ $time }}</div>
 *         </div>
 *         <div class="image">
 *             <div class="image-and-nb">
 *                 <img src="{{ $imageThumbUrl }}" alt="{{ $title }}">
 *                 <div class="nb">
 *                     <div class="top radius"> </div>
 *                     <div class="value radius">{{ $nbImages}}</div>
 *                 </div>
 *             </div>
 *         </div>
 *         <div class="detail">
 *             <div class="title">{{ $title }}</div>
 *             <div class="category">{{ $pro }}</div>
 *             <div class="placement">{{ $placement }}</div>
 *             <div class="price">{{ $price }} €</div>
 *         </div>
 *     </div>
 * </a>
 *
 * NOTE(review): the getters below select on `itemprop` attributes and on the
 * `.item_imagePic` / `.item_imageNumber` classes, none of which appear in the
 * sample above. The sample presumably describes an older version of the
 * markup; confirm against the pages actually being crawled.
 */
class SearchResultAdCrawler
{
    /**
     * The ad node, as handed to the constructor.
     *
     * @var Crawler
     */
    protected $node;

    /**
     * Value of the node's "href" attribute, read once at construction.
     *
     * @var string|null
     */
    protected $url;

    /**
     * @param Crawler $node Node wrapping the ad's anchor element; its "href"
     *                      attribute is used as the ad URL.
     */
    public function __construct(Crawler $node)
    {
        $this->node = $node;
        $this->url = $node->attr('href');
    }

    /**
     * Return the Ad's ID
     *
     * The ID is the numeric file name of the URL path
     * ("/{category}/{id}.htm"). When the path does not follow that layout the
     * path itself is returned unchanged, and an empty string is returned when
     * the URL has no path at all.
     *
     * @return string
     */
    public function getId()
    {
        // Ask parse_url() for the path only: it yields null when the URL has
        // no path and false on a malformed URL, instead of a missing array key.
        $path = parse_url((string) $this->url, PHP_URL_PATH);

        if (!is_string($path)) {
            return '';
        }

        // Return only the captured digits, so that anything surrounding the
        // "/{category}/{id}.htm" segment cannot leak into the ID.
        if (preg_match('/\/\w+\/(\d+)\.htm/', $path, $matches)) {
            return $matches[1];
        }

        return $path;
    }

    /**
     * Return the title
     *
     * Read from the "title" attribute of the ad node and trimmed.
     *
     * @return mixed
     */
    public function getTitle()
    {
        return $this->getFieldValue($this->node, 0, static function ($value) {
            // The attribute may be absent, in which case there is no string to trim.
            return trim((string) $value);
        }, 'attr', 'title');
    }

    /**
     * Return the price
     *
     * Every non-digit character of the price text is dropped before the
     * conversion to an integer; 0 is returned when no price node is found.
     *
     * @return int
     */
    public function getPrice()
    {
        $node = $this->node->filter('*[itemprop=price]');

        return $this->getFieldValue($node, 0, static function ($value) {
            return (int) preg_replace('/\D/', '', trim($value));
        });
    }

    /**
     * Return the Ad's URL
     *
     * The scheme is forced to "http".
     *
     * @return string
     */
    public function getUrl()
    {
        return (string) Http::createFromString($this->url)->withScheme('http');
    }

    /**
     * Return the date and time the ad was created
     *
     * The date comes from the "content" attribute of the
     * `availabilityStarts` node; the time is whatever follows the first comma
     * of that node's text (or the whole text when it contains no comma).
     *
     * @return string|null "{date} {time}", or null when the ad has no
     *                     `availabilityStarts` node
     */
    public function getCreatedAt()
    {
        $node = $this->node
            ->filter('*[itemprop=availabilityStarts]')
            ->first();

        // Reading an attribute from an empty node list throws, so bail out
        // first, the same way getThumb() does for a missing picture.
        if (0 === $node->count()) {
            return null;
        }

        $date = $node->attr('content');

        $time = $this->getFieldValue($node, 0, static function ($value) {
            $value = trim($value);
            $comma = strpos($value, ',');

            // Without this check a missing comma (false) would be used as
            // offset 0 and silently chop the first characters of the text.
            if (false === $comma) {
                return $value;
            }

            return ltrim(substr($value, $comma + 1));
        });

        return $date.' '.$time;
    }

    /**
     * Return the thumb picture url
     *
     * The scheme is forced to "http".
     *
     * @return null|string null when the ad has no lazy-loaded picture
     */
    public function getThumb()
    {
        $image = $this->node
            ->filter('.item_imagePic .lazyload[data-imgsrc]')
            ->first();

        if (0 === $image->count()) {
            return null;
        }

        $src = $image->attr('data-imgsrc');

        return (string) Http::createFromString($src)->withScheme('http');
    }

    /**
     * Return the number of pictures of the ad
     *
     * @return int 0 when the picture counter is missing
     */
    public function getNbImage()
    {
        $node = $this->node->filter('.item_imageNumber');

        return $this->getFieldValue($node, 0, static function ($value) {
            return (int) trim($value);
        });
    }

    /**
     * Return the text of the `availableAtOrFrom` node with every run of
     * whitespace collapsed to a single space.
     *
     * @return mixed an empty string when the node is missing
     */
    public function getPlacement()
    {
        $node = $this->node->filter('*[itemprop=availableAtOrFrom]');

        return $this->getFieldValue($node, '', static function ($value) {
            return preg_replace('/\s+/', ' ', trim($value));
        });
    }

    /**
     * Return the kind of advertiser, derived from the `category` node.
     *
     * @return mixed 'pro' when the node text, stripped of whitespace and
     *               parentheses, is exactly "pro"; 'part' for any other text;
     *               false when the node is missing
     */
    public function getType()
    {
        $node = $this->node->filter('*[itemprop=category]');

        return $this->getFieldValue($node, false, static function ($value) {
            if ('pro' === preg_replace('/[\s()]+/', '', $value)) {
                return 'pro';
            }

            return 'part';
        });
    }

    /**
     * Return every field of the ad at once.
     *
     * @return \stdClass object exposing id, title, price, url, created_at,
     *                   thumb, nb_image, placement and type
     */
    public function getAll()
    {
        return (object) [
            'id' => $this->getId(),
            'title' => $this->getTitle(),
            'price' => $this->getPrice(),
            'url' => $this->getUrl(),
            'created_at' => $this->getCreatedAt(),
            'thumb' => $this->getThumb(),
            'nb_image' => $this->getNbImage(),
            'placement' => $this->getPlacement(),
            'type' => $this->getType(),
        ];
    }

    /**
     * Return the field's value
     *
     * When the node list is not empty, the crawler method named by $funcName
     * is called with $funcParam and its result is passed through $callback.
     * Otherwise $defaultValue is returned untouched.
     *
     * @param Crawler  $node         Node to read the value from
     * @param mixed    $defaultValue Value returned when $node is empty
     * @param callable $callback     Receives the raw value, returns the final one
     * @param string   $funcName     Crawler method used to read the raw value
     * @param string   $funcParam    Single argument passed to that method
     *
     * @return mixed
     */
    private function getFieldValue(
        Crawler $node,
        $defaultValue,
        callable $callback,
        $funcName = 'text',
        $funcParam = ''
    ) {
        if (0 === $node->count()) {
            return $defaultValue;
        }

        return $callback($node->$funcName($funcParam));
    }
}
222
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.