Passed
Push — master ( ca9f9d...eb1f44 )
by Dispositif
07:46
created

ExternPage::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2.0185

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 2
eloc 5
nc 2
nop 3
dl 0
loc 8
ccs 5
cts 6
cp 0.8333
crap 2.0185
rs 10
c 1
b 0
f 1
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe/Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Infrastructure\TagParser;
15
use Exception;
16
use Psr\Log\LoggerInterface;
17
18
/**
 * Represents the web page behind an external link (i.e. a page outside the wiki).
 *
 * Holds the page URL and its raw HTML, and extracts metadata from that HTML:
 * JSON-LD data, <meta> tags, the <html lang="…"> attribute and the <title>.
 *
 * @package App\Domain
 */
class ExternPage
{
    /**
     * URL of the page, validated by the constructor.
     *
     * @var string
     */
    private $url;

    /**
     * Raw HTML source of the page.
     *
     * @var string
     */
    private $html;

    /**
     * Optional logger, used to report sub-domain extraction failures.
     *
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must be accepted by ExternHttpClient::isWebURL()
     * @param string               $html raw HTML source of the page
     * @param LoggerInterface|null $log  optional logger
     *
     * @throws Exception when $url is not recognized as a web URL
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isWebURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata that can be extracted from the HTML.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]; 'meta' contains the parsed
     *               <meta> tags plus the 'html-lang' and 'html-title' entries (each
     *               string or null)
     * @throws Exception
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json"> elements.
     *
     * Returns the first script content that decodes to an array and whose '@type'
     * (when it is a string) does not contain "Breadcrumb". Empty or undecodable
     * scripts are skipped.
     *
     * @param string $html
     *
     * @return array decoded JSON-LD data, or [] when nothing usable is found
     * @throws Exception NOTE(review): presumably raised by TagParser — confirm
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty script content (todo?)
            if ('' === $json) {
                continue;
            }

            $data = json_decode($json, true);
            // skip invalid JSON / scalar values, and breadcrumb-only descriptions
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a "name => content" map.
     *
     * The key is the value of the name, property or http-equiv attribute; the value
     * is the content attribute. When a key occurs several times the last one wins.
     *
     * todo move? /refac/delete?
     *
     * @param string $str HTML source
     *
     * @return array
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            // $out[1] (keys) and $out[2] (contents) always have the same length
            return array_combine($out[1], $out[2]) ?: [];
        }

        return [];
    }

    /**
     * Shorten the sub-domain to a "pretty" registrable-looking domain name.
     *
     * todo refactor
     * todo optimize "https://www6.nhk.or.jp" => "nhk.or.jp"
     * test.com => test.com
     * bla.test.com => test.com
     * bla.my-site.com => my-site.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     *
     * @return string
     * @throws Exception when the sub-domain cannot be extracted from the URL
     */
    public function getPrettyDomainName(): string
    {
        $subDomain = $this->getSubDomain();

        // Hosts kept whole: national TLDs that commonly use second-level domains
        // (.co.uk, .or.jp, ...) and Google-hosted sites. The TLD is matched at the
        // END of the host only, so "bla.mail.com" is not mistaken for a ".ma" domain.
        if (preg_match('#\.(?:uk|jp|ma|kr)$#', $subDomain)
            || false !== strpos($subDomain, 'site.google.com')
        ) {
            return $subDomain;
        }

        // bla.test.com => test.com (hyphens are legal inside a domain label)
        if (preg_match('#[\w-]+\.\w+$#', $subDomain, $matches)) {
            return $matches[0];
        }

        return $subDomain;
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * Delegates to ExternDomains::extractSubDomain(). On failure a warning is logged
     * (when a logger is set) and a new Exception chained to the original is thrown.
     *
     * @return string
     * @throws Exception when the sub-domain cannot be extracted
     */
    public function getSubDomain(): string
    {
        try {
            return ExternDomains::extractSubDomain($this->url);
        } catch (Exception $e) {
            if ($this->log) {
                $this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
            }
            // keep the original exception as "previous" so the root cause is not lost
            throw new Exception('ExternDomains::extractSubDomain NULL', 0, $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute made of 2 to 15 letters/hyphens is recognized.
     *
     * @param string $html
     *
     * @return string|null the lang value as written in the page, or null when not found
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * @param string $html
     *
     * @return string|null trimmed title, or null when no plain <title>…</title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }
}
229