ExternPage::__construct() - Code Metrics - Inspection of "Fix OuvrageComplete::processSousTitre" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( ca9f9d...eb1f44 )

by Dispositif

created 2020-11-13 12:05 UTC

ExternPage::__construct() A

↳ Parent: ExternPage

Complexity

Conditions	2
Paths	2

Size

Total Lines	8
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	5
CRAP Score	2.0185

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
cc	2
eloc	5
nc	2
nop	3
dl	0
loc	8
ccs	5
cts	6
cp	0.8333
crap	2.0185
rs	10
c	1
b	0
f	1

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019/2020 © Philippe/Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);


namespace App\Domain;

use App\Application\Http\ExternHttpClient;
use App\Infrastructure\TagParser;
use Exception;
use Psr\Log\LoggerInterface;

/**
 * Represents the web page behind an external link (i.e. a page outside the wiki).
 *
 * Wraps the page URL together with its already-downloaded HTML and exposes
 * helpers to extract metadata (JSON-LD, <meta> tags, language, title) and a
 * human-friendly domain name.
 *
 * Class ExternPage
 *
 * @package App\Domain
 */
class ExternPage
{
    /**
     * Matches hosts ending with a country-code TLD that commonly uses
     * second-level registrations (test.co.uk, nhk.or.jp, ...). Such hosts are
     * not reduced to their last two labels.
     */
    private const COUNTRY_TLD_PATTERN = '#\.(?:uk|jp|ma|kr)$#i';

    /**
     * Captures the last two labels of a host name (hyphens allowed).
     */
    private const LAST_TWO_LABELS_PATTERN = '#[\w-]+\.[\w-]+$#';

    /**
     * @var string
     */
    private $url;

    /**
     * @var string
     */
    private $html;

    /**
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must be accepted by ExternHttpClient::isWebURL()
     * @param string               $html raw HTML source of the page
     * @param LoggerInterface|null $log  optional logger used to report domain extraction failures
     *
     * @throws Exception when $url is not recognised as a web URL
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isWebURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect the metadata found in the page HTML.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]; 'meta' holds the <meta> tags
     *               plus the 'html-lang' and 'html-title' entries (null when not found)
     * @throws Exception
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and whose
     * '@type' (when it is a string) does not contain "Breadcrumb".
     *
     * @param string $html
     *
     * @return array decoded JSON-LD data, or an empty array when nothing usable is found
     * @throws Exception
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty <script> content
            if ('' === $json) {
                continue;
            }
            $data = json_decode($json, true);
            // skip invalid/scalar JSON and breadcrumb-only descriptions
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map "name|property|http-equiv value" => "content value".
     *
     * When the same key appears several times, the last occurrence wins
     * (array_combine semantics).
     *
     * todo move? /refac/delete?
     *
     * @param string $str HTML source
     *
     * @return array
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * Human-friendly domain name of the page.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * blog.my-site.com => my-site.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     *
     * todo refactor
     * todo optimize "https://www6.nhk.or.jp" => "nhk.or.jp"
     *
     * @return string
     * @throws Exception when the sub-domain cannot be extracted from the URL
     */
    public function getPrettyDomainName(): string
    {
        return self::prettifyDomain($this->getSubDomain());
    }

    /**
     * Reduce a host name to its last two labels, except for hosts ending with
     * one of the handled country-code TLDs (.uk, .jp, .ma, .kr) and for
     * site.google.com pages, which are returned unchanged.
     *
     * @param string $subDomain host name such as "bla.test.com"
     *
     * @return string
     */
    private static function prettifyDomain(string $subDomain): string
    {
        // The TLD test is anchored at the end of the host so that names merely
        // containing ".uk", ".jp", ".ma" or ".kr" (en.kremlin.ru, blog.mail.ru)
        // are still shortened.
        if (preg_match(self::COUNTRY_TLD_PATTERN, $subDomain)
            || false !== strpos($subDomain, 'site.google.com')
        ) {
            return $subDomain;
        }

        // bla.test.com => test.com
        if (preg_match(self::LAST_TWO_LABELS_PATTERN, $subDomain, $matches)) {
            return $matches[0];
        }

        return $subDomain;
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string
     * @throws Exception when ExternDomains::extractSubDomain() fails; the original
     *                   exception is attached as "previous"
     */
    public function getSubDomain(): string
    {
        try {
            return ExternDomains::extractSubDomain($this->url);
        } catch (Exception $e) {
            if ($this->log) {
                $this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
            }
            // keep the root cause available to callers and logs
            throw new Exception('ExternDomains::extractSubDomain NULL', 0, $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Accepts single- or double-quoted attribute values and any whitespace
     * between attributes.
     *
     * @param string $html
     *
     * @return string|null the lang attribute value, or null when absent
     */
    private function parseHtmlLang(string $html): ?string
    {
        $pattern = '#<html(?:\s[^>]+)?\slang=(["\'])([A-Z-]{2,15})\1(?:\s[^>]*)?>#i';
        if (preg_match($pattern, $html, $matches)) {
            return $matches[2];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * @param string $html
     *
     * @return string|null the trimmed title, or null when no <title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }
}


1		<?php
2		/*
3		* This file is part of dispositif/wikibot application (@github)
4		* 2019/2020 © Philippe/Irønie <[email protected]>
5		* For the full copyright and MIT license information, view the license file.
6		*/
7
8		declare(strict_types=1);
9
10
11		namespace App\Domain;
12
13		use App\Application\Http\ExternHttpClient;
14		use App\Infrastructure\TagParser;
15		use Exception;
16		use Psr\Log\LoggerInterface;
17
18		/**
19		* Représente une page web d'un Lien Externe (hors wiki)
20		* Class ExternPage
21		*
22		* @package App\Domain
23		*/
24		class ExternPage
25		{
26		/**
27		* @var string
28		*/
29		private $url;
30
31		/**
32		* @var string
33		*/
34		private $html;
35
36		/**
37		* @var LoggerInterface\|null
38		*/
39		private $log;
40
41		/**
42		* ExternPage constructor.
43		*
44		* @param string $url
45		* @param string $html
46		* @param LoggerInterface\|null $log
47		*
48		* @throws Exception
49		*/
50	5	public function __construct(string $url, string $html, ?LoggerInterface $log = null)
51		{
52	5	if (!ExternHttpClient::isWebURL($url)) {
53		throw new Exception('string is not an URL '.$url);
54		}
55	5	$this->url = $url;
56	5	$this->html = $html;
57	5	$this->log = $log;
58	5	}
59
60		/**
61		* @return string
62		*/
63		public function getUrl(): string
64		{
65		return $this->url;
66		}
67
68		/**
69		* @return array
70		* @throws Exception
71		*/
72	5	public function getData(): array
73		{
74	5	$ld = $this->parseLdJson($this->html);
75	5	$meta = $this->parseMetaTags($this->html);
76
77	5	$meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
78		$meta['html-title'] = $this->parseHtmlTitle($this->html);
79
80		return ['JSON-LD' => $ld, 'meta' => $meta];
81		}
82
83		/**
84		* extract LD-JSON metadata from <script type="application/ld+json">.
85		*
86		* @param string $html
87		*
88		* @return array
89	5	* @throws Exception
90		* @throws Exception
91	5	*/
92	5	private function parseLdJson(string $html): array
93	5	{
94		$parser = new TagParser();
95		$results = $parser->importHtml($html)->xpathResults(
96	5	'//script[@type="application/ld+json"]'
97	3	);
98
99	3	foreach ($results as $result) {
100		$json = trim($result);
101		// filtrage empty value (todo?)
102	3	if (0 === strlen($json)) {
103	3	continue;
104		}
105		$data = json_decode($json, true);
106		if (!is_array($data)
107	3	\|\| (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
108		) {
109		continue;
110	2	}
111
112		return $data;
113		}
114
115		return [];
116		}
117
118		/**
119		* todo move? /refac/delete?
120	5	*
121		* @param string $str
122	5	*
123		* @return array
124		*/
125		private function parseMetaTags(string $str): array
126		{
127		$pattern = '
128		~<\s*meta\s
129		# using lookahead to capture type to $1
130		(?=[^>]*?
131		\b(?:name\|property\|http-equiv)\s=\s
132		(?\|"\s([^"]?)\s"\|\'\s([^\']?)\s\'\|
133		([^"\'>]?)(?=\s/?\s>\|\s\w+\s=))
134		)
135		# capture content to $2
136		[^>]?\bcontent\s=\s*
137	5	(?\|"\s([^"]?)\s"\|\'\s([^\']?)\s\'\|
138	5	([^"\'>]?)(?=\s/?\s>\|\s\w+\s=))
139		[^>]*>
140	5	~ix';
141
142		if (preg_match_all($pattern, $str, $out)) {
143		$combine = array_combine($out[1], $out[2]);
144
145		return $combine ? $combine : [];
146		}
147
148		return [];
149		}
150
151		/**
152		* todo refactor
153		* todo optimize "https://www6.nhk.or.jp" => "nhk.or.jp"
154		* test.com => test.com
155		* bla.test.com => test.com
156		* test.co.uk => test.co.uk (national commercial subdomain)
157		* site.google.com => site.google.com (blog)
158		*
159		* @return string
160		* @throws Exception
161		*/
162		public function getPrettyDomainName(): string
163		{
164		$subDomain = $this->getSubDomain();
165
166		if (!strpos($subDomain, '.uk') && !strpos($subDomain, '.jp') && !strpos($subDomain, '.ma')
167		&& !strpos($subDomain, '.kr')
168		&& strpos($subDomain, 'site.google.com') === false
169		) {
170		// bla.test.com => Test.com
171		if (preg_match('#\w+\.\w+$#', $subDomain, $matches)) {
172		return $matches[0];
173		}
174		}
175
176		return $subDomain;
177		}
178
179		/**
180		* "http://www.bla.co.uk/fubar" => "bla.co.uk"
181		* @return string\|null
182		* @throws Exception
183		*/
184		public function getSubDomain(): string
185		{
186		try {
187		return ExternDomains::extractSubDomain($this->url);
188		} catch (Exception $e) {
189		if ($this->log) {
190		$this->log->warning('ExternDomains::extractSubDomain NULL '.$this->url);
191		}
192		throw new Exception('ExternDomains::extractSubDomain NULL');
193		}
194		}
195
196		/**
197		* Extract language from <html lang="en-us"> tag.
198		*
199		* @param string $html
200		*
201		* @return string\|null
202		*/
203		private function parseHtmlLang(string $html): ?string
204		{
205		if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
206		return $matches[1];
207		}
208
209		return null;
210		}
211
212		/**
213		* Extract webpage title from HTML <title>
214		* not foolproof : example <!-- <title>bla</title> -->
215		*
216		* @param string $html
217		*
218		* @return string\|null
219		*/
220		private function parseHtmlTitle(string $html): ?string
221		{
222		if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
223		return trim(strip_tags($matches[1]));
224		}
225
226		return null;
227		}
228		}
229

Dispositif / Wikibot

Push — master ( ca9f9d...eb1f44 )

ExternPage::__construct() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like