ScraperUtil   A
last analyzed

Complexity

Total Complexity 27

Size/Duplication

Total Lines 134
Duplicated Lines 0 %

Test Coverage

Coverage 95%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 27
eloc 60
c 1
b 0
f 0
dl 0
loc 134
ccs 57
cts 60
cp 0.95
rs 10

5 Methods

Rating   Name   Duplication   Size   Complexity  
A html2text() 0 7 1
C convertDomNodeToText() 0 37 15
A extractScriptData() 0 23 4
A createDomDocument() 0 13 2
A getValue() 0 17 5
1
<?php
2
3
declare(strict_types=1);
4
5
/*
6
 * Copyright (c) Ne-Lexa
7
 *
8
 * For the full copyright and license information, please view
9
 * the LICENSE file that was distributed with this source code.
10
 *
11
 * @see https://github.com/Ne-Lexa/google-play-scraper
12
 */
13
14
namespace Nelexa\GPlay\Util;
15
16
/**
17
 * @internal
18
 */
19
class ScraperUtil
{
    /**
     * Collects the JSON payloads passed to `AF_initDataCallback(...)` inline scripts.
     *
     * Every `>AF_initDataCallback(...);</script` occurrence in the markup is inspected.
     * When it contains both a `ds:*` key (text starting with `ds:` up to the next single
     * quote) and a `data:` section, the data section is JSON-decoded into an associative
     * array and stored under that key. Occurrences missing either part are skipped.
     *
     * @param string $html raw page markup
     *
     * @throws \InvalidArgumentException when a data section is not valid JSON
     *                                   (raised by Guzzle's JSON helper)
     *
     * @return array decoded payloads indexed by their `ds:*` key; empty when nothing matched
     */
    public static function extractScriptData(string $html): array
    {
        $scripts = [];

        // preg_match_all() yields 0 (no match) or false (PCRE error): nothing to decode.
        if (!preg_match_all('/>AF_initDataCallback\((.*?)\);<\/script/s', $html, $matches)) {
            return $scripts;
        }

        foreach ($matches[0] as $script) {
            if (
                preg_match("/(ds:.*?)'/", $script, $keyMatch)
                && preg_match('/data:([\s\S]*?)(, }\);<\/|, sideChannel:)/', $script, $valueMatch)
            ) {
                // Utils::jsonDecode() is the supported replacement for the deprecated
                // \GuzzleHttp\json_decode() function.
                $scripts[$keyMatch[1]] = \GuzzleHttp\Utils::jsonDecode($valueMatch[1], true);
            }
        }

        return $scripts;
    }

    /**
     * Parses an HTML fragment into a DOM document, forcing UTF-8 interpretation.
     *
     * libxml parse errors are collected internally while loading (instead of being
     * emitted as PHP warnings). The error buffer is then cleared and the caller's
     * previous libxml error-handling mode is restored, whether or not loading succeeded.
     *
     * @param string $html
     *
     * @throws \RuntimeException when the markup cannot be loaded
     *
     * @return \DOMDocument
     */
    public static function createDomDocument(string $html): \DOMDocument
    {
        $doc = new \DOMDocument();
        $internalErrors = libxml_use_internal_errors(true);

        try {
            // The XML declaration prefix makes libxml treat the input as UTF-8.
            if (!$doc->loadHTML('<?xml encoding="utf-8"?>' . $html)) {
                throw new \RuntimeException(
                    'error load html: ' . $html
                );
            }
        } finally {
            // Drop the errors gathered during this parse so they do not pile up,
            // then hand back the error mode the caller had before.
            libxml_clear_errors();
            libxml_use_internal_errors($internalErrors);
        }

        return $doc;
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * Block-level elements become paragraphs, list items become `- ` bullets and
     * `<br>` becomes a line break. Runs of three or more newlines are collapsed to
     * a single blank line and the result is trimmed.
     *
     * @param string $html
     *
     * @return string
     */
    public static function html2text(string $html): string
    {
        $text = trim(self::convertDomNodeToText(self::createDomDocument($html)));

        // preg_replace() returns null on a PCRE failure; keep the uncollapsed text then.
        return preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
    }

    /**
     * Recursively renders a DOM node and its descendants as plain text.
     *
     * @param \DOMNode $node
     *
     * @return string
     */
    private static function convertDomNodeToText(\DOMNode $node): string
    {
        if ($node instanceof \DOMText) {
            // Any whitespace run inside a text node counts as a single space.
            return preg_replace('/\s+/', ' ', $node->wholeText) ?? $node->wholeText;
        }

        $text = '';

        // childNodes can be null for node types that cannot hold children.
        if ($node->childNodes !== null) {
            foreach ($node->childNodes as $childNode) {
                $text .= self::convertDomNodeToText($childNode);
            }
        }

        switch ($node->nodeName) {
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'p':
            case 'ul':
            case 'div':
                // Block elements are separated from their surroundings by blank lines.
                return "\n\n" . $text . "\n\n";

            case 'li':
                return '- ' . $text . "\n";

            case 'br':
                return $text . "\n";
        }

        return $text;
    }

    /**
     * Reads a nested value from an array by following a key path.
     *
     * The array is only read, never modified; it is accepted by reference solely to
     * keep the existing signature.
     *
     * @param array        $array source array
     * @param array|string $path  list of keys, or a string of keys joined by $glue
     * @param string       $glue  separator used to split a string $path
     *
     * @return mixed the value found at the path, or null when any key along it is missing
     */
    public static function getValue(array &$array, $path, string $glue = '.')
    {
        $keys = \is_array($path) ? $path : explode($glue, (string) $path);

        // Walk by value: the data is only read, so no element references are needed.
        $current = $array;

        foreach ($keys as $key) {
            if (!\is_array($current) || !\array_key_exists($key, $current)) {
                return null;
            }

            $current = $current[$key];
        }

        return $current;
    }
}
155