Passed
Push — master ( 26e53d...a73e13 )
by Dispositif
09:44
created

ExternPage::getPrettyDomainName()   A

Complexity

Conditions 6
Paths 3

Size

Total Lines 14
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 3
Bugs 0 Features 1
Metric Value
cc 6
eloc 6
c 3
b 0
f 1
nc 3
nop 0
dl 0
loc 14
ccs 0
cts 5
cp 0
crap 42
rs 9.2222
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Infrastructure\TagParser;
15
use Exception;
16
use Psr\Log\LoggerInterface;
17
18
/**
 * Represents the web page targeted by an external link (outside the wiki).
 *
 * Holds the page URL and its raw HTML, and exposes the metadata that can be
 * extracted from that HTML (JSON-LD, meta tags, html lang attribute) as well
 * as helpers to get the host name of the URL.
 *
 * @package App\Domain
 */
class ExternPage
{
    /**
     * Host fragments for which getPrettyDomainName() keeps the full sub-domain
     * instead of reducing it to its last two labels.
     */
    private const FULL_SUBDOMAIN_MARKERS = ['.co.uk', '.co.ma', '.co.kr', 'site.google.'];

    /**
     * @var string URL of the page, validated in the constructor
     */
    private $url;

    /**
     * @var string raw HTML of the page
     */
    private $html;

    /**
     * @var LoggerInterface|null optional logger, only used to report sub-domain extraction failures
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page
     * @param string               $html raw HTML of the page
     * @param LoggerInterface|null $log  optional logger
     *
     * @throws Exception when ExternHttpClient::isWebURL() does not accept $url
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isWebURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect all metadata extracted from the HTML.
     *
     * @return array ['JSON-LD' => array, 'meta' => array] where 'meta' also
     *               carries an 'html-lang' entry (string|null)
     * @throws Exception
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and whose
     * string '@type' does not contain "Breadcrumb"; an empty array otherwise.
     *
     * @param string $html
     *
     * @return array
     * @throws Exception
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if ('' === $json) {
                continue;
            }

            $data = json_decode($json, true);
            if (!is_array($data)) {
                // invalid JSON or scalar payload
                continue;
            }

            $isBreadcrumb = isset($data['@type'])
                && is_string($data['@type'])
                && preg_match('#Breadcrumb#i', $data['@type']);
            if ($isBreadcrumb) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map "name|property|http-equiv value" => "content value".
     *
     * When the same key appears several times, the last occurrence wins
     * (array_combine() behaviour).
     *
     * todo move? /refac/delete?
     *
     * @param string $str HTML to scan
     *
     * @return array
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (!preg_match_all($pattern, $str, $out)) {
            return [];
        }

        $combine = array_combine($out[1], $out[2]);

        return is_array($combine) ? $combine : [];
    }

    /**
     * Shorten the sub-domain of the URL to a readable domain name.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     *
     * @return string
     * @throws Exception
     */
    public function getPrettyDomainName(): string
    {
        $subDomain = $this->getSubDomain();

        foreach (self::FULL_SUBDOMAIN_MARKERS as $marker) {
            // Strict comparison: strpos() returns 0 when the marker starts the
            // string (e.g. "site.google.com"), which must count as a match.
            if (false !== strpos($subDomain, $marker)) {
                return $subDomain;
            }
        }

        // bla.test.com => test.com
        if (preg_match('#\w+\.\w+$#', $subDomain, $matches)) {
            return $matches[0];
        }

        return $subDomain;
    }

    /**
     * Sub-domain of the URL as computed by ExternDomains::extractSubDomain().
     *
     * @return string
     * @throws Exception when the extraction fails (the original exception is
     *                   attached as "previous")
     */
    public function getSubDomain(): string
    {
        try {
            return ExternDomains::extractSubDomain($this->url);
        } catch (Exception $e) {
            if ($this->log) {
                $this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
            }
            // keep the root cause available to callers and logs
            throw new Exception('ExternDomains::extractSubDomain NULL', 0, $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * A two-part code such as 'en-us' is reduced to its first part ('en').
     *
     * @param string $html
     *
     * @return string|null the language code, or null when no double-quoted
     *                     lang attribute is found on the <html> tag
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (!preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return null;
        }

        // 'en-us' => 'en' // todo move in Language
        return preg_replace('#^([a-z]+)-[a-z]+$#i', '$1', $matches[1]);
    }
}
208