Passed
Push — master ( dafac1...9ad278 )
by Dispositif
08:36
created

ExternPage::parseLdJson()   B

Complexity

Conditions 7
Paths 4

Size

Total Lines 24
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 7.1782

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 7
eloc 13
nc 4
nop 1
dl 0
loc 24
ccs 11
cts 13
cp 0.8462
crap 7.1782
rs 8.8333
c 1
b 0
f 1
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Infrastructure\TagParser;
15
use Exception;
16
use Psr\Log\LoggerInterface;
17
18
/**
 * Represents the web page targeted by an external link (outside the wiki).
 *
 * Holds the page URL and its raw HTML, and exposes the metadata found in that
 * HTML (JSON-LD script blocks and <meta> tags) as well as domain-name helpers.
 *
 * @package App\Domain
 */
class ExternPage
{
    /**
     * Substrings that mark a host which must be kept whole instead of being
     * shortened to its last two labels (national commercial suffixes, hosted
     * sites).
     */
    private const FULL_SUBDOMAIN_MARKERS = ['.co.uk', '.co.ma', '.co.kr', 'site.google.'];

    /**
     * @var string URL of the page, validated in the constructor
     */
    private $url;

    /**
     * @var string Raw HTML of the page
     */
    private $html;

    /**
     * @var LoggerInterface|null Optional logger, only used to report sub-domain extraction failures
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must be accepted by ExternHttpClient::isWebURL()
     * @param string               $html Raw HTML of the page
     * @param LoggerInterface|null $log  Optional logger
     *
     * @throws Exception When $url is not accepted as a web URL
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isWebURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }

        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string The URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect the metadata extracted from the page HTML.
     *
     * @return array Two entries: 'JSON-LD' (decoded JSON-LD data, possibly empty)
     *               and 'meta' (map of <meta> tag name to content, possibly empty)
     * @throws Exception
     */
    public function getData(): array
    {
        return [
            'JSON-LD' => $this->parseLdJson($this->html),
            'meta' => $this->parseMetaTags($this->html),
        ];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json"> elements.
     *
     * Returns the first script content that decodes to an array and whose
     * top-level "@type" (when it is a string) does not contain "Breadcrumb"
     * (case-insensitive). Empty or undecodable contents are skipped.
     *
     * @param string $html
     *
     * @return array The decoded data, or an empty array when nothing usable is found
     * @throws Exception Presumably propagated from TagParser (not visible here)
     */
    private function parseLdJson(string $html): array
    {
        $scripts = (new TagParser())
            ->importHtml($html)
            ->xpathResults('//script[@type="application/ld+json"]');

        foreach ($scripts as $script) {
            $json = trim($script);
            // Skip empty script blocks.
            if ('' === $json) {
                continue;
            }

            $data = json_decode($json, true);
            if (!is_array($data)) {
                continue;
            }

            // Breadcrumb descriptions carry navigation data, not page metadata.
            $type = $data['@type'] ?? null;
            if (is_string($type) && false !== stripos($type, 'Breadcrumb')) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags carrying a name/property/http-equiv attribute and a
     * content attribute.
     *
     * todo: move / refactor / delete?
     *
     * @param string $str HTML to scan
     *
     * @return array Map of attribute value (name, property or http-equiv) to content value;
     *               when the same key appears several times the last occurrence wins
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        // preg_match_all() yields 0 (no match) or false (regex error): nothing to return.
        if (!preg_match_all($pattern, $str, $matches)) {
            return [];
        }

        $tags = array_combine($matches[1], $matches[2]);

        return $tags ?: [];
    }

    /**
     * Shorten the host of the page to a readable domain name.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     *
     * @return string
     * @throws Exception When the sub-domain cannot be extracted from the URL
     */
    public function getPrettyDomainName(): string
    {
        $subDomain = $this->getSubDomain();

        foreach (self::FULL_SUBDOMAIN_MARKERS as $marker) {
            // Strict comparison: strpos() returns 0 (not false) when the host
            // starts with the marker, e.g. "site.google.com".
            if (false !== strpos($subDomain, $marker)) {
                return $subDomain;
            }
        }

        // bla.test.com => test.com
        if (preg_match('#\w+\.\w+$#', $subDomain, $matches)) {
            return $matches[0];
        }

        return $subDomain;
    }

    /**
     * Extract the sub-domain of the page URL through ExternDomains::extractSubDomain().
     *
     * @return string
     * @throws Exception When the extraction fails; the original exception is chained as previous
     */
    public function getSubDomain(): string
    {
        try {
            return ExternDomains::extractSubDomain($this->url);
        } catch (Exception $e) {
            if (null !== $this->log) {
                $this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
            }

            // Keep the root cause available to callers and log handlers.
            throw new Exception('ExternDomains::extractSubDomain NULL', 0, $e);
        }
    }
}
189