1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\Google\Extractor; |
4
|
|
|
|
5
|
|
|
use LogicException; |
6
|
|
|
use PiedWeb\Extractor\Helper; |
7
|
|
|
use PiedWeb\Google\Result\BusinessResult; |
8
|
|
|
use PiedWeb\Google\Result\SearchResult; |
9
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
10
|
|
|
|
11
|
|
|
/**
 * Extracts structured data (results, SERP features, related searches, business results, ...)
 * from the raw HTML of a Google result page.
 *
 * The selectors rely on literal labels of the French-language Google interface
 * (e.g. "Adresses", "Autres questions posées").
 */
class SERPExtractor
{
    /**
     * XPath expressions used to detect each SERP feature, indexed by feature name.
     * A feature is reported as present as soon as one of its expressions matches a node
     * with a non-empty value (see getNode()).
     *
     * The 'PeolpleAlsoAsked' key is misspelled but deliberately left untouched: the keys are
     * returned by getSerpFeatures() and serialized by toJson(), so renaming it would change
     * the public output.
     *
     * @var array<string, string[]>
     */
    final public const SERP_FEATURE_SELECTORS = [
        'Ads' => ['.//*[@id="tads"]|.//*[@id="bottomads"]'],
        'ImagePack' => ["//span[text()='Images']", "//h3[starts-with(text(), 'Images correspondant')]"],
        'Local Pack' => ["//div[text()='Adresses']"],
        'PositionZero' => ["//h2[text()='Extrait optimisé sur le Web']"],
        'KnowledgePanel' => ['//div[contains(concat(" ",normalize-space(@class)," ")," kp-wholepage ")]'],
        'News' => ['//span[text()="À la une"]'],
        'PeolpleAlsoAsked' => ['//span[text()="Autres questions posées"]'],
        'Video' => ['//span[text()="Vidéos"]', '//div[contains( @aria-label,"second")]'],
        'Reviews' => ['//span[contains( @aria-label,"Note")]'],
    ];

    /**
     * XPath expressions matching the related-search labels when the page is detected as a
     * mobile SERP (see getRelatedSearches()).
     *
     * @var string[]
     */
    final public const RELATED = ["//a[@data-xbu][starts-with(@href, '/search')]/div/div/span"];

    /**
     * XPath expressions matching the related-search labels when the page is NOT detected as a
     * mobile SERP (see getRelatedSearches()).
     *
     * @var string[]
     */
    final public const RELATED_DESKTOP = ["//a[@data-xbu][starts-with(@href, '/search')]/div"];

    /**
     * XPath matching the result links: the first classed link of the featured snippet block
     * ("Extrait optimisé sur le Web"), plus every link with role="presentation".
     * Used by getResults() and by isMobileSerp().
     *
     * @var string
     */
    final public const RESULT_SELECTOR = '(//h2[text()=\'Extrait optimisé sur le Web\']/ancestor::block-component//a[@class])[1]|//a[@role="presentation"] ';

    /**
     * XPath matching desktop result containers. Not referenced inside this class.
     *
     * @var string
     */
    final public const RESULT_SELECTOR_DESKTOP =
        '//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-hveid]
        |//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-sokoban-container]';

    /** Crawler built once from the page HTML and queried by every extraction method. */
    private readonly Crawler $domCrawler;

    /**
     * Cache filled by getResults(false) (ads included); null until the first such call.
     *
     * @var \PiedWeb\Google\Result\SearchResult[]|null
     */
    private ?array $results = null;

    /**
     * @param string $html        raw HTML of the result page
     * @param int    $extractedAt extraction time as an integer in `ymdHi` format;
     *                            0 (the default) means "now"
     */
    public function __construct(public string $html, private int $extractedAt = 0)
    {
        $this->domCrawler = new Crawler($html);

        if (0 === $this->extractedAt) {
            $this->extractedAt = (int) (new \DateTimeImmutable('now'))->format('ymdHi');
        }
    }

    /**
     * Tells whether the page is treated as a mobile SERP, i.e. whether RESULT_SELECTOR
     * matches a non-empty node.
     */
    private function isMobileSerp(): bool
    {
        return $this->exists([self::RESULT_SELECTOR]);
    }

    /**
     * Returns the number of results announced by the page, or 0 when the stats node is absent.
     */
    public function getNbrResults(): int
    {
        $node = null;
        if (! $this->exists(['//*[@id="resultStats"]', '//*[@id="result-stats"]'], $node)) {
            return 0;
        }

        // NOTE(review): the stats line is assumed to possibly end with a parenthesised timing
        // such as "(0,45 secondes)"; it is dropped first so that its digits are not glued onto
        // the result count by the digit-only filter below.
        $stats = Helper::preg_replace_str('/\([^)]*\)/', '', $node?->nodeValue ?? '');

        return (int) Helper::preg_replace_str('/[^0-9]/', '', $stats);
    }

    /**
     * Returns the `data-q` attribute of every `div[@data-q]` element of the page.
     *
     * @return string[]
     *
     * @throws \Exception if a matched node is not a DOM element
     */
    public function getAlsoAsked(): array
    {
        $questions = [];

        foreach ($this->domCrawler->filterXPath('//div[@data-q]') as $node) {
            if (! $node instanceof \DOMElement) {
                throw new \Exception('Unexpected non-element node matched by `//div[@data-q]`.');
            }

            $questions[] = $node->getAttribute('data-q');
        }

        return $questions;
    }

    /**
     * Builds one BusinessResult per `[data-rc_ludocids]` element.
     *
     * When two consecutive elements carry the same cid, the earlier one is replaced by the
     * later one so that the business is listed once and positions stay contiguous.
     *
     * @return BusinessResult[]
     */
    public function extractBusinessResults(): array
    {
        $mapsResults = [];
        $index = 0;

        foreach ($this->domCrawler->filter('[data-rc_ludocids]') as $node) {
            if (! $node instanceof \DOMElement) {
                continue;
            }

            $cid = $node->getAttribute('data-rc_ludocids');

            // Same business as the previous entry: drop it and reuse its slot.
            $previousIndex = $index - 1;
            if ($previousIndex >= 0 && $cid === $mapsResults[$previousIndex]->cid) {
                unset($mapsResults[$previousIndex]);
                $index = $previousIndex;
            }

            $business = new BusinessResult();
            $business->cid = $cid;
            $business->name = $this->extractBusinessName($node);
            $business->position = $index + 1;
            $business->organicPos = $index + 1;
            $business->pixelPos = $this->getPixelPosFor($node->getNodePath() ?? '');

            $mapsResults[$index] = $business;
            ++$index;
        }

        return $mapsResults;
    }

    /**
     * Returns the text of the first `span` found in the given business node,
     * or an empty string when there is none.
     */
    private function extractBusinessName(\DOMElement $node): string
    {
        $nameNode = (new Crawler($node))->filter('span')->getNode(0);

        return $nameNode?->textContent ?? '';
    }

    /**
     * Extracts the results matched by RESULT_SELECTOR.
     *
     * `position` counts every returned result, `organicPos` counts only non-ad results
     * (it is 0 for ads). The full list (ads included) is cached after the first
     * `getResults(false)` call; organic-only calls are always recomputed.
     *
     * @param bool $organicOnly when true, links located inside `#tads` / `#bottomads` are skipped
     *
     * @return SearchResult[]
     */
    public function getResults(bool $organicOnly = true): array
    {
        if (! $organicOnly && null !== $this->results) {
            return $this->results;
        }

        $linkNodes = $this->domCrawler->filterXPath(self::RESULT_SELECTOR);
        $results = [];
        $organicCount = 0;

        foreach ($linkNodes as $index => $linkNode) {
            // A link is an ad when one of its ancestors is an ads container.
            $isAd = null !== $linkNodes->eq($index)->closest('#tads, #bottomads');
            if ($organicOnly && $isAd) {
                continue;
            }

            $result = $this->extractResultFrom($linkNode, $isAd);
            if (! $result instanceof SearchResult) {
                continue;
            }

            $result->organicPos = $isAd ? 0 : ++$organicCount;
            $result->position = \count($results) + 1;
            $results[] = $result;
        }

        if (! $organicOnly) {
            $this->results = $results;
        }

        return $results;
    }

    /**
     * Converts a result link node into a SearchResult.
     *
     * @return SearchResult|null null when the link must be ignored (href starting with
     *                           `https://www.google.` or with `/aclk?`)
     *
     * @throws \Exception if the node is not a DOM element
     */
    private function extractResultFrom(\DOMNode $linkNode, bool $ads = false): ?SearchResult
    {
        if (! $linkNode instanceof \DOMElement) {
            throw new \Exception('Google changes his selector.');
        }

        $href = $linkNode->getAttribute('href');

        // Links pointing back to Google (shopping results) and `/aclk?` links are not results.
        if (str_starts_with($href, 'https://www.google.') || str_starts_with($href, '/aclk?')) {
            return null;
        }

        $result = new SearchResult();
        $result->pixelPos = $this->getPixelPosFor($linkNode->getNodePath() ?? '');
        $result->url = $href;
        $result->title = (new Crawler($linkNode))->text('');
        $result->ads = $ads;

        return $result;
    }

    /**
     * Returns the vertical pixel position of an element.
     *
     * This implementation has no rendering information and always returns 0.
     * NOTE(review): being protected, it is presumably meant to be overridden by a subclass
     * able to compute real positions — confirm.
     *
     * @param string|\DOMNode $element node, or node path as returned by DOMNode::getNodePath()
     */
    protected function getPixelPosFor(string|\DOMNode $element): int
    {
        return 0;
    }

    /**
     * Tells whether the given SERP feature is present on the page.
     *
     * @param string $serpFeatureName one of the keys of SERP_FEATURE_SELECTORS
     * @param int    $pos             set to the pixel position of the matched node when the
     *                                feature is found; left untouched otherwise
     *
     * @throws \InvalidArgumentException if the feature name is not a known key
     */
    public function containsSerpFeature(string $serpFeatureName, int &$pos = 0): bool
    {
        $xpaths = self::SERP_FEATURE_SELECTORS[$serpFeatureName]
            ?? throw new \InvalidArgumentException('Unknown SERP feature `'.$serpFeatureName.'`.');

        // exists() hands back the matched node, so the XPaths are evaluated only once.
        $node = null;
        if (! $this->exists($xpaths, $node) || ! $node instanceof \DOMNode) {
            return false;
        }

        $pos = $this->getPixelPosFor($node);

        return true;
    }

    /**
     * Lists the SERP features found on the page.
     *
     * @return array<string, int> pixel position of each detected feature, indexed by feature name
     */
    public function getSerpFeatures(): array
    {
        $features = [];

        foreach (array_keys(self::SERP_FEATURE_SELECTORS) as $serpFeatureName) {
            $pos = 0;
            if ($this->containsSerpFeature($serpFeatureName, $pos)) {
                $features[$serpFeatureName] = $pos;
            }
        }

        return $features;
    }

    /**
     * Returns the featured-snippet ("position zero") result.
     *
     * @throws \LogicException if the featured-snippet link cannot be found
     */
    public function getPositionsZero(): SearchResult
    {
        $linkNode = $this->domCrawler
            ->filterXPath("//h2[text()='Extrait optimisé sur le Web']/ancestor::block-component//a[@class]")
            ->getNode(0);

        if (! $linkNode instanceof \DOMElement) {
            throw new \LogicException('Google has changed its selector (position Zero)');
        }

        $result = new SearchResult();
        $result->position = 1; // not the real position: the snippet is always reported first
        $result->organicPos = 1;
        $result->pixelPos = $this->getPixelPosFor($linkNode->getNodePath() ?? '');
        $result->url = $linkNode->getAttribute('href');
        $result->title = $linkNode->textContent;

        return $result;
    }

    /**
     * Returns the non-empty related-search labels, using the mobile or the desktop
     * selectors depending on isMobileSerp().
     *
     * @return string[]
     */
    public function getRelatedSearches(): array
    {
        $xpaths = $this->isMobileSerp() ? self::RELATED : self::RELATED_DESKTOP;
        $keywords = [];

        foreach ($xpaths as $xpath) {
            foreach ($this->domCrawler->filterXPath($xpath) as $node) {
                if ('' === $node->textContent) {
                    continue;
                }

                $keywords[] = $node->textContent;
            }
        }

        return $keywords;
    }

    /**
     * Tells whether at least one of the XPath expressions matches a non-empty node.
     *
     * @param string[]      $xpaths
     * @param \DOMNode|null $node   receives the matched node on success; untouched on failure
     */
    public function exists(array $xpaths, ?\DOMNode &$node = null): bool
    {
        try {
            $node = $this->getNode($xpaths);
        } catch (LogicException) {
            return false;
        }

        return true;
    }

    /**
     * Returns the first node with a non-empty value matched by the given XPath expressions,
     * which are tried in order (only the first match of each expression is considered).
     *
     * @param string[] $xpaths
     *
     * @throws \LogicException if no expression yields a non-empty node
     */
    public function getNode(array $xpaths): \DOMNode
    {
        foreach ($xpaths as $xpath) {
            $node = $this->domCrawler->filterXPath($xpath)->getNode(0);

            if ($node instanceof \DOMNode && '' !== $node->nodeValue) {
                return $node;
            }
        }

        throw new \LogicException('`'.implode('`, `', $xpaths).'` not found');
    }

    /**
     * Serializes everything this extractor can read from the page into a JSON document.
     */
    public function toJson(): string
    {
        return \Safe\json_encode([
            'version' => '1',
            'extractedAt' => $this->extractedAt,
            'resultStat' => $this->getNbrResults(),
            'serpFeatures' => $this->getSerpFeatures(),
            'relatedSearches' => $this->getRelatedSearches(),
            'results' => $this->getResults(false),
            'alsoAsked' => $this->getAlsoAsked(),
            'businessResults' => $this->extractBusinessResults(),
        ]);
    }
}
324
|
|
|
|