Passed
Push — master ( 641929...562012 )
by Alexey
06:04 queued 12s
created

ScraperUtil::extractScriptData()   A

Complexity

Conditions 4
Paths 2

Size

Total Lines 22
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 4

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 12
c 1
b 0
f 0
dl 0
loc 22
ccs 11
cts 11
cp 1
rs 9.8666
cc 4
nc 2
nop 1
crap 4
1
<?php
2
3
declare(strict_types=1);
4
5
/*
6
 * Copyright (c) Ne-Lexa
7
 *
8
 * For the full copyright and license information, please view
9
 * the LICENSE file that was distributed with this source code.
10
 *
11
 * @see https://github.com/Ne-Lexa/google-play-scraper
12
 */
13
14
namespace Nelexa\GPlay\Util;
15
16
/**
 * Static helpers for extracting embedded script data and readable text from HTML markup.
 *
 * @internal
 */
class ScraperUtil
{
    /**
     * Element names whose text is set off by a blank line before and after.
     */
    private const BLOCK_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'div'];

    /**
     * Collects the payloads of every `AF_initDataCallback(...)` script call found in the markup.
     *
     * @param string $html raw page markup
     *
     * @return array decoded `data:` payloads, indexed by the `ds:*` key of their callback
     *
     * @throws \InvalidArgumentException if a matched payload is not valid JSON
     */
    public static function extractScriptData(string $html): array
    {
        $scripts = [];

        if (!preg_match_all('/>AF_initDataCallback\((.*?)\);<\/script/s', $html, $matches)) {
            return $scripts;
        }

        foreach ($matches[0] as $callback) {
            // A callback is only usable when both its key and its data section can be located.
            if (
                preg_match("/(ds:.*?)'/", $callback, $keyMatch)
                && preg_match('/data:([\s\S]*?)(, }\);<\/|, sideChannel:)/', $callback, $valueMatch)
            ) {
                // Utils::jsonDecode() is the documented replacement for the deprecated
                // \GuzzleHttp\json_decode() function, which is removed in guzzlehttp/guzzle 8.0.
                $scripts[$keyMatch[1]] = \GuzzleHttp\Utils::jsonDecode($valueMatch[1], true);
            }
        }

        return $scripts;
    }

    /**
     * Parses an HTML string into a DOM document.
     *
     * Parser diagnostics are buffered while loading instead of being emitted as PHP warnings;
     * the caller's libxml error-handling mode is restored afterwards, even on failure.
     *
     * @param string $html markup to parse
     *
     * @return \DOMDocument
     *
     * @throws \RuntimeException if the markup cannot be loaded
     */
    public static function createDomDocument(string $html): \DOMDocument
    {
        $doc = new \DOMDocument();
        $previousState = libxml_use_internal_errors(true);

        try {
            // The XML declaration prefix is an encoding hint so the markup is read as UTF-8.
            if (!$doc->loadHTML('<?xml encoding="utf-8"?>' . $html)) {
                throw new \RuntimeException(
                    'error load html: ' . $html
                );
            }
        } finally {
            // Buffering was switched on by this method only: drop the diagnostics it collected
            // so they do not pile up across calls. A caller that had buffering enabled already
            // keeps its error list untouched.
            if (!$previousState) {
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previousState);
        }

        return $doc;
    }

    /**
     * Converts HTML markup to plain text.
     *
     * Runs of three or more line breaks are collapsed to a single blank line and the
     * result is trimmed.
     *
     * @param string $html markup to convert
     *
     * @return string
     */
    public static function html2text(string $html): string
    {
        $text = trim(self::convertDomNodeToText(self::createDomDocument($html)));

        // preg_replace() yields null on a PCRE failure; keep the uncollapsed text in that case.
        return preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
    }

    /**
     * Recursively renders a DOM node and its descendants as plain text.
     *
     * Text nodes have whitespace runs squeezed to one space; block-level elements are
     * wrapped in blank lines, list items become "- " bullets and `br` appends a line break.
     *
     * @param \DOMNode $node node to render
     *
     * @return string
     */
    private static function convertDomNodeToText(\DOMNode $node): string
    {
        if ($node instanceof \DOMText) {
            // Fall back to the raw text if preg_replace() reports a failure (null).
            return preg_replace('/\s+/', ' ', $node->wholeText) ?? $node->wholeText;
        }

        $text = '';

        if ($node->childNodes !== null) {
            foreach ($node->childNodes as $childNode) {
                $text .= self::convertDomNodeToText($childNode);
            }
        }

        $tag = $node->nodeName;

        if (in_array($tag, self::BLOCK_TAGS, true)) {
            return "\n\n" . $text . "\n\n";
        }

        if ($tag === 'li') {
            return '- ' . $text . "\n";
        }

        if ($tag === 'br') {
            return $text . "\n";
        }

        return $text;
    }
}
128