Passed
Push — master ( a73e13...03a489 )
by Dispositif
15:32
created

ExternPage::parseHtmlLang()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
eloc 4
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 10
ccs 0
cts 0
cp 0
crap 6
rs 10
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Infrastructure\TagParser;
15
use Exception;
16
use Psr\Log\LoggerInterface;
17
18
/**
 * Represents the web page targeted by an external link (i.e. outside the wiki).
 * Class ExternPage
 *
 * Holds the URL and the raw HTML of the page and exposes helpers to extract
 * metadata (JSON-LD, meta tags, language, title) and domain information.
 *
 * @package App\Domain
 */
class ExternPage
{
    /**
     * @var string
     */
    private $url;

    /**
     * @var string
     */
    private $html;

    /**
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must be accepted by ExternHttpClient::isWebURL()
     * @param string               $html raw HTML source of the page
     * @param LoggerInterface|null $log  optional logger, used to report sub-domain extraction failures
     *
     * @throws Exception when $url is rejected by ExternHttpClient::isWebURL()
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isWebURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata extracted from the HTML.
     *
     * The 'meta' entry contains the parsed meta tags, completed with the
     * 'html-lang' (from <html lang="en">) and 'html-title' (from <title>) keys.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]
     * @throws Exception
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and whose
     * '@type' (when it is a string) does not contain "Breadcrumb".
     *
     * @param string $html
     *
     * @return array decoded JSON-LD data, or an empty array when nothing usable is found
     * @throws Exception presumably raised by TagParser — TODO confirm
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if (0 === strlen($json)) {
                continue;
            }
            $data = json_decode($json, true);
            // skip invalid JSON / scalar values and breadcrumb-only descriptions
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a [name|property|http-equiv => content] map.
     * todo move? /refac/delete?
     *
     * When the same key appears several times, array_combine() keeps the last value.
     *
     * @param string $str HTML source
     *
     * @return array
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return is_array($combine) ? $combine : [];
        }

        return [];
    }

    /**
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     *
     * @return string
     * @throws Exception when the sub-domain can not be extracted from the URL
     */
    public function getPrettyDomainName(): string
    {
        $subDomain = $this->getSubDomain();

        // Hosts containing one of these markers are returned untouched.
        // The comparison against false must be strict: strpos() returns 0 (falsy)
        // when the marker is found at the very beginning (e.g. "site.google.com").
        foreach (['.co.uk', '.co.ma', '.co.kr', 'site.google.'] as $marker) {
            if (false !== strpos($subDomain, $marker)) {
                return $subDomain;
            }
        }

        // bla.test.com => test.com
        if (preg_match('#\w+\.\w+$#', $subDomain, $matches)) {
            return $matches[0];
        }

        return $subDomain;
    }

    /**
     * Sub-domain of the page URL, as computed by ExternDomains::extractSubDomain().
     *
     * @return string
     * @throws Exception when the extraction fails (the original exception is chained as "previous")
     */
    public function getSubDomain(): string
    {
        try {
            return ExternDomains::extractSubDomain($this->url);
        } catch (Exception $e) {
            if ($this->log) {
                $this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
            }
            // keep the root cause available to callers through getPrevious()
            throw new Exception('ExternDomains::extractSubDomain NULL', 0, $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Accepts single or double quoted values and any whitespace between attributes.
     * A two-part code such as 'en-us' is reduced to its first part ('en').
     *
     * @param string $html
     *
     * @return string|null null when no lang attribute is found
     */
    private function parseHtmlLang(string $html): ?string
    {
        $pattern = '#<html(?:\s[^>]*)?\slang\s*=\s*(["\'])([A-Z-]{2,15})\1(?:\s[^>]*)?>#i';
        if (!preg_match($pattern, $html, $matches)) {
            return null;
        }

        // 'en-us' => 'en' // todo move in Language
        return preg_replace('#^([a-z]+)-[a-z]+$#i', '$1', $matches[2]);
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * The opening tag may carry attributes (e.g. <title data-react-helmet="true">).
     *
     * @param string $html
     *
     * @return string|null null when no non-empty <title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }
}
228