Passed
Push — master ( 5e44c4...8c6c38 )
by Alexey
03:59 queued 11s
created

ScraperUtil::createDomDocument()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2.0625

Importance

Changes 0
Metric Value
eloc 8
c 0
b 0
f 0
dl 0
loc 14
ccs 6
cts 8
cp 0.75
rs 10
cc 2
nc 2
nop 1
crap 2.0625
1
<?php
2
3
declare(strict_types=1);
4
5
/**
6
 * @author   Ne-Lexa
7
 * @license  MIT
8
 *
9
 * @see      https://github.com/Ne-Lexa/google-play-scraper
10
 */
11
12
namespace Nelexa\GPlay\Util;
13
14
/**
15
 * @internal
16
 */
17
class ScraperUtil
{
    /**
     * Collects the data payloads of the `AF_initDataCallback` scripts found in the HTML.
     *
     * Every fragment running from `>AF_initDataCallback` to the next `</script` is
     * inspected. A fragment contributes an entry only when it contains both a
     * `ds:...` key (terminated by a single quote) and a `return ...` expression
     * closed by `}});</`. The expression is decoded as JSON into an associative
     * array and stored under the key. A later fragment with the same key
     * overwrites the earlier one.
     *
     * @param string $html raw HTML of the page
     *
     * @return array decoded payloads indexed by their `ds:...` key; empty when
     *               no script matches
     */
    public static function extractScriptData(string $html): array
    {
        $scripts = [];

        if (!preg_match_all('/>AF_initDataCallback[\s\S]*?<\/script/', $html, $matches)) {
            return $scripts;
        }

        foreach ($matches[0] as $fragment) {
            if (
                preg_match("/(ds:.*?)'/", $fragment, $keyMatch) &&
                preg_match('/return ([\s\S]*?)}}\);<\//', $fragment, $valueMatch)
            ) {
                // NOTE(review): the Guzzle helper presumably throws on invalid JSON
                // instead of returning null - confirm against the installed version.
                $scripts[$keyMatch[1]] = \GuzzleHttp\json_decode($valueMatch[1], true);
            }
        }

        return $scripts;
    }

    /**
     * Parses an HTML string into a DOM document.
     *
     * The markup is prefixed with an XML encoding declaration so that the parser
     * treats the input as UTF-8. libxml error buffering is switched on while
     * parsing and the previous setting is always restored afterwards, including
     * when parsing fails and the exception is thrown.
     *
     * @param string $html
     *
     * @throws \RuntimeException when the HTML cannot be loaded
     *
     * @return \DOMDocument
     */
    public static function createDomDocument(string $html): \DOMDocument
    {
        $doc = new \DOMDocument();
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            if (!$doc->loadHTML('<?xml encoding="utf-8"?>' . $html)) {
                throw new \RuntimeException('error load html: ' . $html);
            }
        } finally {
            // Restore the caller's setting on every exit path; the flag is
            // process-wide, so leaving it enabled would affect unrelated code.
            libxml_use_internal_errors($previousErrorMode);
        }

        return $doc;
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * The fragment is parsed into a DOM document and flattened to text, after
     * which runs of three or more line feeds are reduced to exactly two and
     * surrounding whitespace is trimmed.
     *
     * @param string $html
     *
     * @return string
     */
    public static function html2text(string $html): string
    {
        $doc = self::createDomDocument($html);
        $text = trim(self::convertDomNodeToText($doc));

        // preg_replace() returns null on a PCRE failure; keep the text as is
        // in that case instead of passing null to trim().
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Recursively flattens a DOM node into plain text.
     *
     * Text nodes have every whitespace run collapsed into a single space. For
     * any other node the text of its children is concatenated and then
     * decorated according to the node name:
     *  - h1-h6, p, ul, div: wrapped in blank lines;
     *  - li: prefixed with "- " and terminated by a line feed;
     *  - br: followed by a line feed.
     *
     * @param \DOMNode $node
     *
     * @return string
     */
    private static function convertDomNodeToText(\DOMNode $node): string
    {
        if ($node instanceof \DOMText) {
            $raw = $node->wholeText;

            // Fall back to the raw text if PCRE fails and returns null.
            return preg_replace('/\s+/', ' ', $raw) ?? $raw;
        }

        $text = '';

        if ($node->childNodes !== null) {
            foreach ($node->childNodes as $childNode) {
                $text .= self::convertDomNodeToText($childNode);
            }
        }

        switch ($node->nodeName) {
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'p':
            case 'ul':
            case 'div':
                $text = "\n\n" . $text . "\n\n";
                break;

            case 'li':
                $text = '- ' . $text . "\n";
                break;

            case 'br':
                $text .= "\n";
                break;
        }

        return $text;
    }
}
127