Passed
Push — main ( 537449...3825f2 )
by Dev
12:44
created

SERPExtractor::extractBusinessResults()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 29
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 19
nc 4
nop 0
dl 0
loc 29
rs 9.3222
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\Google\Extractor;
4
5
use LogicException;
6
use PiedWeb\Extractor\Helper;
7
use PiedWeb\Google\Result\BusinessResult;
8
use PiedWeb\Google\Result\SearchResult;
9
use Symfony\Component\DomCrawler\Crawler;
10
11
/**
 * Extracts structured data (results, SERP features, related searches, business
 * results, ...) from the raw HTML of a Google result page.
 *
 * The text-based selectors below match the French wording of the page
 * ("Adresses", "À la une", ...).
 */
class SERPExtractor
{
    /**
     * XPath expressions used to detect each SERP feature, keyed by feature name.
     * A feature is reported as present when one of its expressions matches a node
     * with a non-empty value (see getNode()).
     *
     * NOTE: the key 'PeolpleAlsoAsked' is misspelled but kept unchanged because the
     * keys are exposed through getSerpFeatures() and toJson().
     *
     * @var array<string, string[]>
     */
    final public const SERP_FEATURE_SELECTORS = [
        'Ads' => ['.//*[@id="tads"]|.//*[@id="bottomads"]'],
        'ImagePack' => ["//span[text()='Images']", "//h3[starts-with(text(), 'Images correspondant')]"],
        'Local Pack' => ["//div[text()='Adresses']"],
        'PositionZero' => ["//h2[text()='Extrait optimisé sur le Web']"],
        'KnowledgePanel' => ['//div[contains(concat(" ",normalize-space(@class)," ")," kp-wholepage ")]'],
        'News' => ['//span[text()="À la une"]'],
        'PeolpleAlsoAsked' => ['//span[text()="Autres questions posées"]'],
        'Video' => ['//span[text()="Vidéos"]', '//div[contains( @aria-label,"second")]'],
        'Reviews' => ['//span[contains( @aria-label,"Note")]'],
    ];

    /**
     * XPath expressions locating related searches when isMobileSerp() is true.
     *
     * @var string[]
     */
    final public const RELATED = ["//a[@data-xbu][starts-with(@href, '/search')]/div/div/span"];

    /**
     * XPath expressions locating related searches when isMobileSerp() is false.
     *
     * @var string[]
     */
    final public const RELATED_DESKTOP = ["//a[@data-xbu][starts-with(@href, '/search')]/div"];

    /**
     * XPath matching result links: the first link of the featured-snippet block
     * (if any) united with every `a[@role="presentation"]`.
     * Also used by isMobileSerp() to decide which related-search selectors apply.
     *
     * @var string
     */
    final public const RESULT_SELECTOR = '(//h2[text()=\'Extrait optimisé sur le Web\']/ancestor::block-component//a[@class])[1]|//a[@role="presentation"] ';

    /**
     * XPath matching result containers on the desktop layout.
     * Not referenced inside this class; kept as public API.
     *
     * @var string
     */
    final public const RESULT_SELECTOR_DESKTOP =
        '//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-hveid]
        |//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-sokoban-container]';

    private readonly Crawler $domCrawler;

    /**
     * Cache filled by getResults(false); stays null until that call happens.
     *
     * @var SearchResult[]|null
     */
    private ?array $results = null;

    /**
     * @param string $html        raw HTML of the result page
     * @param int    $extractedAt extraction time as an integer built from the `ymdHi`
     *                            date format; 0 (default) means "now"
     */
    public function __construct(public string $html, private int $extractedAt = 0)
    {
        // A readonly property may be initialised once from inside the declaring class,
        // so this assignment is valid despite static-analysis reports to the contrary.
        $this->domCrawler = new Crawler($html);

        if (0 === $this->extractedAt) {
            $this->extractedAt = (int) (new \DateTimeImmutable('now'))->format('ymdHi');
        }
    }

    /**
     * Tells whether the page matches RESULT_SELECTOR, which this class treats as
     * the marker of the mobile layout.
     */
    private function isMobileSerp(): bool
    {
        return $this->exists([self::RESULT_SELECTOR]);
    }

    /**
     * Returns the number of results announced by the page, or 0 when the
     * result-stats element is missing.
     */
    public function getNbrResults(): int
    {
        $node = null;
        // Each expression must be a complete XPath: a trailing `|` (dangling union
        // operator) is not a valid expression.
        if (! $this->exists(['//*[@id="resultStats"]', '//*[@id="result-stats"]'], $node)) {
            return 0;
        }

        // Keep digits only, dropping any wording and thousands separators.
        return (int) Helper::preg_replace_str('/[^0-9]/', '', $node->nodeValue ?? '');
    }

    /**
     * Returns the `data-q` attribute of every `div[@data-q]` found in the page.
     *
     * @return string[]
     *
     * @throws \Exception when a matched node is not a DOM element
     */
    public function getAlsoAsked(): array
    {
        $alsoAsked = [];
        foreach ($this->domCrawler->filterXPath('//div[@data-q]') as $node) {
            $alsoAsked[] = $node instanceof \DOMElement
                ? $node->getAttribute('data-q')
                : throw new \Exception('Unexpected non-element node matched by `//div[@data-q]`.');
        }

        return $alsoAsked;
    }

    /**
     * Extracts the business results, i.e. the elements carrying a
     * `data-rc_ludocids` attribute.
     *
     * When two consecutive elements share the same cid, only the later one is
     * kept, so positions stay contiguous.
     *
     * @return BusinessResult[] list indexed from 0
     */
    public function extractBusinessResults(): array
    {
        $mapsResults = [];
        $i = 0;

        foreach ($this->domCrawler->filter('[data-rc_ludocids]') as $node) {
            if (! $node instanceof \DOMElement) {
                continue;
            }

            $cid = $node->getAttribute('data-rc_ludocids');

            // Same cid as the previous entry: rewind so the entry below overwrites it.
            if ($i > 0 && $cid === $mapsResults[$i - 1]->cid) {
                --$i;
            }

            $business = new BusinessResult();
            $business->cid = $cid;
            $business->name = $this->extractBusinessName($node);
            $business->position = $i + 1;
            $business->organicPos = $i + 1;
            $business->pixelPos = $this->getPixelPosFor($node->getNodePath() ?? '');

            $mapsResults[$i] = $business;
            ++$i;
        }

        return $mapsResults;
    }

    /**
     * Returns the text of the first `span` inside $node, or '' when there is none.
     */
    private function extractBusinessName(\DOMElement $node): string
    {
        $nameNode = (new Crawler($node))->filter('span')->getNode(0);

        return $nameNode?->textContent ?? '';
    }

    /**
     * Extracts the search results matched by RESULT_SELECTOR.
     *
     * Only the complete list (`$organicOnly = false`) is cached; the organic-only
     * list is recomputed on every call.
     *
     * @param bool $organicOnly when true, links located inside `#tads` or
     *                          `#bottomads` are skipped
     *
     * @return SearchResult[] list indexed from 0; `position` counts every returned
     *                        result, `organicPos` counts non-ad results only (0 for ads)
     */
    public function getResults(bool $organicOnly = true): array
    {
        if (! $organicOnly && null !== $this->results) {
            return $this->results;
        }

        $nodes = $this->domCrawler->filterXPath(self::RESULT_SELECTOR);
        $toReturn = [];
        $i = 0;
        $iOrganic = 0;

        foreach ($nodes as $k => $node) {
            // A link is an ad when it has an ancestor matching one of the ad containers.
            $ads = null !== $nodes->eq($k)->closest('#tads, #bottomads');
            if ($organicOnly && $ads) {
                continue;
            }

            $result = $this->extractResultFrom($node, $ads);
            if (null === $result) {
                continue;
            }

            $result->organicPos = $ads ? 0 : ++$iOrganic;
            $result->position = $i + 1;
            $toReturn[$i] = $result;
            ++$i;
        }

        if (! $organicOnly) {
            $this->results = $toReturn;
        }

        return $toReturn;
    }

    /**
     * Builds a SearchResult from a link node.
     *
     * @param bool $ads whether the link was found inside an ad container
     *
     * @return SearchResult|null null when the link is skipped (href starting with
     *                           `https://www.google.` or `/aclk?`)
     *
     * @throws \Exception when $linkNode is not a DOM element
     */
    private function extractResultFrom(\DOMNode $linkNode, bool $ads = false): ?SearchResult
    {
        if (! $linkNode instanceof \DOMElement) {
            throw new \Exception('Google changes his selector.');
        }

        $href = $linkNode->getAttribute('href');

        // Skip shopping results and ad-click redirects.
        if (str_starts_with($href, 'https://www.google.') || str_starts_with($href, '/aclk?')) {
            return null;
        }

        $toReturn = new SearchResult();
        $toReturn->pixelPos = $this->getPixelPosFor($linkNode->getNodePath() ?? '');
        $toReturn->url = $href;
        $toReturn->title = (new Crawler($linkNode))->text('');
        $toReturn->ads = $ads;

        return $toReturn;
    }

    /**
     * Returns the pixel position of an element; this implementation always returns 0.
     *
     * NOTE(review): $element is intentionally unused here. The method is protected
     * and the class is not final, so it is presumably an extension point for a
     * subclass able to measure rendered positions — confirm.
     *
     * @param string|\DOMNode $element a node, or a node path as returned by getNodePath()
     */
    protected function getPixelPosFor(/** @scrutinizer ignore-unused */ string|\DOMNode $element): int
    {
        return 0;
    }

    /**
     * Tells whether the page contains the given SERP feature.
     *
     * @param string $serpFeatureName a key of SERP_FEATURE_SELECTORS
     * @param int    $pos             set to the pixel position of the matched node
     *                                when the feature is found, left untouched otherwise
     *
     * @throws \InvalidArgumentException when $serpFeatureName is not a known feature
     */
    public function containsSerpFeature(string $serpFeatureName, int &$pos = 0): bool
    {
        $xpaths = self::SERP_FEATURE_SELECTORS[$serpFeatureName]
            ?? throw new \InvalidArgumentException('Unknown SERP feature `'.$serpFeatureName.'`.');

        // Reuse the node captured by exists() rather than evaluating the XPaths twice.
        $node = null;
        if (! $this->exists($xpaths, $node) || ! $node instanceof \DOMNode) {
            return false;
        }

        $pos = $this->getPixelPosFor($node);

        return true;
    }

    /**
     * Returns the SERP features found in the page.
     *
     * @return array<string, int> pixel position of each detected feature, keyed by feature name
     */
    public function getSerpFeatures(): array
    {
        $pos = 0;
        $result = [];
        foreach (array_keys(self::SERP_FEATURE_SELECTORS) as $serpFeatureName) {
            if ($this->containsSerpFeature($serpFeatureName, $pos)) {
                $result[$serpFeatureName] = $pos;
            }
        }

        return $result;
    }

    /**
     * Returns the featured-snippet ("position zero") result.
     *
     * @throws \LogicException when no featured-snippet link is found in the page
     */
    public function getPositionsZero(): SearchResult
    {
        $linkNodePositionZero = $this->domCrawler
            ->filterXPath("//h2[text()='Extrait optimisé sur le Web']/ancestor::block-component//a[@class]")
            ->getNode(0);

        if (! $linkNodePositionZero instanceof \DOMElement) {
            throw new \LogicException('Google has changed its selector (position Zero)');
        }

        $toReturn = new SearchResult();
        $toReturn->position = 1; // not true
        $toReturn->organicPos = 1;
        $toReturn->pixelPos = $this->getPixelPosFor($linkNodePositionZero->getNodePath() ?? '');
        $toReturn->url = $linkNodePositionZero->getAttribute('href');
        $toReturn->title = $linkNodePositionZero->textContent;

        return $toReturn;
    }

    /**
     * Returns the non-empty text of every related-search node, using the RELATED
     * selectors when isMobileSerp() is true and RELATED_DESKTOP otherwise.
     *
     * @return string[]
     */
    public function getRelatedSearches(): array
    {
        $kw = [];
        $xpaths = $this->isMobileSerp() ? self::RELATED : self::RELATED_DESKTOP;
        foreach ($xpaths as $xpath) {
            foreach ($this->domCrawler->filterXPath($xpath) as $node) {
                if ('' !== $node->textContent) {
                    $kw[] = $node->textContent;
                }
            }
        }

        return $kw;
    }

    /**
     * Tells whether at least one of the XPath expressions matches a node with a
     * non-empty value.
     *
     * @param string[]      $xpaths expressions tried in order
     * @param \DOMNode|null $node   receives the matched node on success; left
     *                              untouched on failure
     */
    public function exists(array $xpaths, ?\DOMNode &$node = null): bool
    {
        try {
            $node = $this->getNode($xpaths);

            return true;
        } catch (LogicException) {
            return false;
        }
    }

    /**
     * Returns the first node matched by the first expression whose first match has
     * a non-empty node value.
     *
     * @param string[] $xpaths expressions tried in order
     *
     * @throws \LogicException when no expression yields such a node
     */
    public function getNode(array $xpaths): \DOMNode
    {
        foreach ($xpaths as $xpath) {
            $node = $this->domCrawler->filterXPath($xpath)->getNode(0);
            if ($node instanceof \DOMNode && '' !== $node->nodeValue) {
                return $node;
            }
        }

        throw new \LogicException('`'.implode('`, ', $xpaths).'` not found');
    }

    /**
     * Serialises everything this extractor can read from the page to JSON.
     */
    public function toJson(): string
    {
        return \Safe\json_encode([
            'version' => '1',
            'extractedAt' => $this->extractedAt,
            'resultStat' => $this->getNbrResults(),
            'serpFeatures' => $this->getSerpFeatures(),
            'relatedSearches' => $this->getRelatedSearches(),
            'results' => $this->getResults(false),
            'alsoAsked' => $this->getAlsoAsked(),
            'businessResults' => $this->extractBusinessResults(),
        ]);
    }
}
324