1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
/* |
6
|
|
|
* Copyright (c) Ne-Lexa |
7
|
|
|
* |
8
|
|
|
* For the full copyright and license information, please view |
9
|
|
|
* the LICENSE file that was distributed with this source code. |
10
|
|
|
* |
11
|
|
|
* @see https://github.com/Ne-Lexa/google-play-scraper |
12
|
|
|
*/ |
13
|
|
|
|
14
|
|
|
namespace Nelexa\GPlay\Util; |
15
|
|
|
|
16
|
|
|
/**
 * Static helpers for extracting embedded script data from HTML pages,
 * converting HTML fragments to plain text and reading nested array values.
 *
 * @internal
 */
class ScraperUtil
{
    /**
     * Collects the payloads of every `AF_initDataCallback(...)` script call in a page.
     *
     * Each call is expected to carry a key of the form `ds:N` and a `data:` field
     * terminated either by `, sideChannel:` or by the end of the call. Calls that
     * do not match both patterns are skipped.
     *
     * @param string $html raw HTML of the page
     *
     * @throws \InvalidArgumentException if a `data:` payload is not valid JSON
     *
     * @return array<string, mixed> decoded payloads indexed by their `ds:N` key;
     *                              empty when the page contains no such calls
     */
    public static function extractScriptData(string $html): array
    {
        // preg_match_all() yields 0 (no calls found) or false (PCRE failure);
        // in both cases there is nothing to decode.
        if (!preg_match_all('/>AF_initDataCallback\((.*?)\);<\/script/s', $html, $matches)) {
            return [];
        }

        $scripts = [];

        foreach ($matches[0] as $item) {
            if (
                preg_match("/(ds:.*?)'/", $item, $keyMatch)
                && preg_match('/data:([\s\S]*?)(, }\);<\/|, sideChannel:)/', $item, $valueMatch)
            ) {
                // A later call with the same key overwrites an earlier one.
                $scripts[$keyMatch[1]] = self::decodeJson($valueMatch[1]);
            }
        }

        return $scripts;
    }

    /**
     * Parses an HTML string into a DOM document, forcing UTF-8 decoding.
     *
     * libxml errors are collected internally while parsing so that malformed
     * markup does not raise PHP warnings; the previous error-handling mode is
     * restored afterwards, including when parsing fails.
     *
     * @param string $html HTML markup to parse
     *
     * @throws \RuntimeException if the markup cannot be loaded
     *
     * @return \DOMDocument
     */
    public static function createDomDocument(string $html): \DOMDocument
    {
        $doc = new \DOMDocument();
        $internalErrors = libxml_use_internal_errors(true);

        try {
            // The XML declaration prefix tells libxml to treat the input as UTF-8.
            if (!$doc->loadHTML('<?xml encoding="utf-8"?>' . $html)) {
                throw new \RuntimeException(
                    'error load html: ' . $html
                );
            }
        } finally {
            // Restore the caller's libxml mode even when the exception above is thrown.
            libxml_use_internal_errors($internalErrors);
        }

        return $doc;
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * Block-level elements become paragraphs separated by a blank line, list
     * items are prefixed with "- " and `<br>` becomes a line break. Runs of
     * three or more newlines are collapsed to a single blank line.
     *
     * @param string $html HTML markup to convert
     *
     * @return string trimmed plain-text rendering
     */
    public static function html2text(string $html): string
    {
        $doc = self::createDomDocument($html);
        $text = self::convertDomNodeToText($doc);
        $text = preg_replace('/\n{3,}/', "\n\n", trim($text));

        return trim($text);
    }

    /**
     * Recursively renders a DOM node and its descendants as text.
     *
     * @param \DOMNode $node node to render
     *
     * @return string text of the node, with newlines added around block elements
     */
    private static function convertDomNodeToText(\DOMNode $node): string
    {
        if ($node instanceof \DOMText) {
            // Collapse every whitespace run inside a text node to a single space.
            $text = preg_replace('/\s+/', ' ', $node->wholeText);
        } else {
            $text = '';

            // Some node types expose no child list at all, hence the null check.
            if ($node->childNodes !== null) {
                foreach ($node->childNodes as $childNode) {
                    $text .= self::convertDomNodeToText($childNode);
                }
            }

            switch ($node->nodeName) {
                case 'h1':
                case 'h2':
                case 'h3':
                case 'h4':
                case 'h5':
                case 'h6':
                case 'p':
                case 'ul':
                case 'div':
                    // Block elements are surrounded by blank lines.
                    $text = "\n\n" . $text . "\n\n";
                    break;

                case 'li':
                    $text = '- ' . $text . "\n";
                    break;

                case 'br':
                    $text .= "\n";
                    break;
            }
        }

        return $text;
    }

    /**
     * Reads a nested value from an array by following a key path.
     *
     * The array is accepted by reference for backward compatibility only; it is
     * never modified, and no references into it are created.
     *
     * @param array        $array array to read from
     * @param array|string $path  list of keys, or a string of keys joined by $glue
     * @param string       $glue  separator used to split a string path
     *
     * @return mixed the value at the path, or null if any segment is missing
     *               (a stored null is therefore indistinguishable from a missing key)
     */
    public static function getValue(array &$array, $path, string $glue = '.')
    {
        if (!\is_array($path)) {
            $path = explode($glue, (string) $path);
        }

        $current = $array;

        // The cast is a no-op for arrays; it is kept from the original code so a
        // non-array result of explode() is still handled as a one-segment path.
        foreach ((array) $path as $key) {
            if (!\is_array($current) || !\array_key_exists($key, $current)) {
                return null;
            }

            $current = $current[$key];
        }

        return $current;
    }

    /**
     * Decodes a JSON document into PHP arrays and scalars.
     *
     * @param string $json JSON text
     *
     * @throws \InvalidArgumentException if the text is not valid JSON
     *
     * @return mixed decoded value, with JSON objects as associative arrays
     */
    private static function decodeJson(string $json)
    {
        $data = json_decode($json, true);

        // A JSON literal `null` also decodes to null, so the error code is the
        // only reliable failure signal.
        if (json_last_error() !== \JSON_ERROR_NONE) {
            throw new \InvalidArgumentException('json_decode error: ' . json_last_error_msg());
        }

        return $data;
    }
}
155
|
|
|
|
This function is deprecated, and its supplier has provided an explanatory message.
That message should indicate whether and when the function will be removed, and which function to use instead.