|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace App\Parser; |
|
4
|
|
|
|
|
5
|
|
|
use App\DataProvider\ProviderProvider; |
|
6
|
|
|
use App\DTO\Property; |
|
7
|
|
|
use App\DTO\PropertyAd; |
|
8
|
|
|
use App\Enum\PropertyFilter; |
|
9
|
|
|
use App\Exception\ParseException; |
|
10
|
|
|
use App\Util\NumericUtil; |
|
11
|
|
|
use DateTime; |
|
12
|
|
|
use Exception; |
|
13
|
|
|
use Psr\Log\LoggerInterface; |
|
14
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
|
15
|
|
|
use function array_filter; |
|
16
|
|
|
use function Symfony\Component\String\u; |
|
17
|
|
|
|
|
18
|
|
|
/**
 * Base HTML parser that turns a provider's result page into Property objects.
 *
 * Child classes configure the parser by redefining the PROVIDER and SELECTOR_*
 * constants (they are read through late static binding) and may override any
 * of the protected parse*() hooks.
 */
abstract class AbstractParser implements ParserInterface
{
    // Redefined in the child classes
    protected const PROVIDER = null;

    protected const SELECTOR_AD_WRAPPER = null;
    protected const SELECTOR_PRICE = null;
    protected const SELECTOR_AREA = null;
    protected const SELECTOR_ROOMS_COUNT = null;
    protected const SELECTOR_LOCATION = null;
    protected const SELECTOR_BUILDING_NAME = null;
    protected const SELECTOR_TITLE = null;
    protected const SELECTOR_DESCRIPTION = null;
    protected const SELECTOR_PHOTO = 'img:first-child';
    protected const SELECTOR_URL = 'a:first-child';

    // Keywords looked up in the ad title + description to flag a new-build property.
    // NOTE(review): the lookup in parseOne() is case-sensitive; confirm that
    // capitalised occurrences ("Neuf", "Programme") are meant to be ignored.
    private const NEW_BUILD_KEYWORDS = ['neuf', 'livraison', 'programme', 'neuve', 'nouveau', 'nouvelle', 'remise'];

    public function __construct(
        private ProviderProvider $providerProvider,
        protected LoggerInterface $logger
    ) {}

    /**
     * {@inheritDoc}
     *
     * Ads that fail to parse are logged as warnings and skipped.
     *
     * @throws ParseException When $params['date'] is not a DateTime, or when no ad could be parsed
     */
    public function parse(string $html, array $filters = [], array $params = []): array
    {
        // Validate the publication date once, up front. A missing or invalid value
        // would otherwise surface as a TypeError from parseOne(), which the
        // per-ad `catch (Exception)` below cannot intercept.
        $publishedAt = $params['date'] ?? null;
        if (!$publishedAt instanceof DateTime) {
            throw new ParseException('Missing or invalid "date" parameter');
        }

        $properties = [];

        // Iterate over all DOM elements wrapping a property ad
        $this->createCrawler($html)
            ->filter(static::SELECTOR_AD_WRAPPER)
            ->each(function (Crawler $node) use (&$properties, $params, $publishedAt): void {
                try {
                    $properties[] = $this->parseOne($node, $publishedAt);
                } catch (Exception $e) {
                    // One broken ad must not prevent the others from being parsed
                    $this->logger->warning('Error while parsing a property: ' . $e->getMessage(), $params);
                }
            });

        if (empty($properties)) {
            throw new ParseException('No property parsed');
        }

        // Filter the properties: without the NEW_BUILD filter everything is kept
        if (!isset($filters[PropertyFilter::NEW_BUILD])) {
            return $properties;
        }

        // array_filter() preserves the original keys of the kept entries
        return array_filter($properties, static fn (Property $ad) => $ad->isNewBuild());
    }

    /**
     * Enable to modify the DOM before parsing
     *
     * @param string $html
     *
     * @return Crawler
     */
    protected function createCrawler(string $html): Crawler
    {
        return new Crawler($html);
    }

    /**
     * Parse the price from the SELECTOR_PRICE node, or from the whole ad markup
     * when no selector is configured.
     *
     * @return float|null Null when the selector lookup fails (or when NumericUtil returns null)
     */
    protected function parsePrice(Crawler $crawler): ?float
    {
        if (null === static::SELECTOR_PRICE) {
            return NumericUtil::parsePrice($crawler->html());
        }

        $priceStr = $this->extractText($crawler, static::SELECTOR_PRICE);

        return null === $priceStr ? null : NumericUtil::parsePrice($priceStr);
    }

    /**
     * Parse the area from the SELECTOR_AREA node, or from the whole ad markup
     * when no selector is configured.
     *
     * @return float|null Null when the selector lookup fails (or when NumericUtil returns null)
     */
    protected function parseArea(Crawler $crawler): ?float
    {
        if (null === static::SELECTOR_AREA) {
            return NumericUtil::parseArea($crawler->html());
        }

        $areaStr = $this->extractText($crawler, static::SELECTOR_AREA);

        return null === $areaStr ? null : NumericUtil::parseArea($areaStr);
    }

    /**
     * Parse the rooms count from the SELECTOR_ROOMS_COUNT node, or from the whole
     * ad markup when no selector is configured.
     *
     * @return int|null Null when the selector lookup fails (or when NumericUtil returns null)
     */
    protected function parseRoomsCount(Crawler $crawler): ?int
    {
        if (null === static::SELECTOR_ROOMS_COUNT) {
            return NumericUtil::parseRoomsCount($crawler->html());
        }

        $roomsCountStr = $this->extractText($crawler, static::SELECTOR_ROOMS_COUNT);

        return null === $roomsCountStr ? null : NumericUtil::parseRoomsCount($roomsCountStr);
    }

    /**
     * @return string|null Trimmed location text; null when no selector is configured or the lookup fails
     */
    protected function parseLocation(Crawler $crawler): ?string
    {
        return null === static::SELECTOR_LOCATION
            ? null
            : $this->extractText($crawler, static::SELECTOR_LOCATION);
    }

    /**
     * @return string|null Trimmed building name; null when no selector is configured or the lookup fails
     */
    protected function parseBuildingName(Crawler $crawler): ?string
    {
        return null === static::SELECTOR_BUILDING_NAME
            ? null
            : $this->extractText($crawler, static::SELECTOR_BUILDING_NAME);
    }

    /**
     * @return string|null Trimmed title; null when no selector is configured or the lookup fails
     */
    protected function parseTitle(Crawler $crawler): ?string
    {
        return null === static::SELECTOR_TITLE
            ? null
            : $this->extractText($crawler, static::SELECTOR_TITLE);
    }

    /**
     * @return string|null Trimmed description; null when no selector is configured or the lookup fails
     */
    protected function parseDescription(Crawler $crawler): ?string
    {
        return null === static::SELECTOR_DESCRIPTION
            ? null
            : $this->extractText($crawler, static::SELECTOR_DESCRIPTION);
    }

    /**
     * Read the "src" attribute of the node matching SELECTOR_PHOTO.
     *
     * @return string|null The attribute value as returned by the crawler
     *
     * @throws ParseException When the crawler lookup throws
     */
    protected function parsePhoto(Crawler $crawler): ?string
    {
        try {
            return $crawler->filter(static::SELECTOR_PHOTO)->attr('src');
        } catch (Exception $e) {
            throw new ParseException('Error while parsing the photo: ' . $e->getMessage());
        }
    }

    /**
     * Read the "href" attribute of the node matching SELECTOR_URL.
     *
     * @throws ParseException When the crawler lookup throws or the attribute is absent
     */
    protected function parseUrl(Crawler $crawler): string
    {
        try {
            $url = $crawler->filter(static::SELECTOR_URL)->attr('href');
        } catch (Exception $e) {
            throw new ParseException('Error while parsing the URL: ' . $e->getMessage());
        }

        // attr() yields null when the attribute is absent. Returning it from a
        // `string` method would raise a TypeError, which is not an Exception and
        // would therefore abort the whole page instead of skipping this ad.
        if (null === $url) {
            throw new ParseException('Error while parsing the URL: missing "href" attribute');
        }

        return $url;
    }

    /**
     * Build a Property (and its PropertyAd) from the DOM node wrapping one ad.
     *
     * @throws ParseException
     */
    private function parseOne(Crawler $crawler, DateTime $publishedAt): Property
    {
        $propertyAd = (new PropertyAd)
            ->setProvider(static::PROVIDER)
            ->setTitle($this->parseTitle($crawler))
            ->setDescription($this->parseDescription($crawler))
            ->setPhoto($this->parsePhoto($crawler))
            ->setUrl($this->parseUrl($crawler))
            ->setPublishedAt($publishedAt);

        $property = (new Property)
            ->setPrice($this->parsePrice($crawler))
            ->setArea($this->parseArea($crawler))
            ->setRoomsCount($this->parseRoomsCount($crawler))
            ->setLocation($this->parseLocation($crawler))
            ->setBuildingName($this->parseBuildingName($crawler))
            ->setAd($propertyAd);

        $provider = $this->providerProvider->find(static::PROVIDER);

        if (null !== $provider && $provider->isNewBuildOnly()) {
            // The provider only lists new-build properties: no keyword lookup needed
            $property->setNewBuild(true);
        } else {
            // Otherwise look for a new-build keyword in the title + description
            $adText = $propertyAd->getTitle() . $propertyAd->getDescription();
            $property->setNewBuild(u($adText)->containsAny(self::NEW_BUILD_KEYWORDS));
        }

        return $property;
    }

    /**
     * Return the trimmed text of the node matching $selector, or null when the
     * crawler lookup throws.
     */
    private function extractText(Crawler $crawler, string $selector): ?string
    {
        try {
            return trim($crawler->filter($selector)->text());
        } catch (Exception) {
            return null;
        }
    }
}
|
228
|
|
|
|