ExternPage::parseHtmlLang() - Code Metrics - Inspection of "Refac http domain parsing" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — master ( 766a39...696a12 )

by Dispositif

created 2023-03-22 13:35 UTC

ExternPage::parseHtmlLang() A

↳ Parent: ExternPage

Complexity

Conditions	2
Paths	2

Size

Total Lines	7
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	6

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	2
eloc	3
c	1
b	0
f	0
nc	2
nop	1
dl	0
loc	7
ccs	0
cts	0
cp	0
crap	6
rs	10

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);


namespace App\Domain;

use App\Application\Http\ExternHttpClient;
use App\Domain\Utils\TextUtil;
use App\Infrastructure\InternetDomainParser;
use App\Infrastructure\TagParser;
use Exception;
use Psr\Log\LoggerInterface;

/**
 * Represents the web page targeted by an external link (i.e. a page outside the wiki).
 *
 * Holds the page URL together with its raw HTML and exposes the metadata that can be
 * extracted from that HTML (JSON-LD, <meta> tags, document language, title) as well
 * as a display-friendly domain name derived from the URL.
 *
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    // Host suffixes for which getPrettyDomainName() keeps the raw host string
    // instead of reducing it to the registrable domain.
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
        ];

    /**
     * @var string URL of the page (validated as an HTTP URL by the constructor)
     */
    private $url;

    /**
     * @var string Raw HTML of the page
     */
    private $html;

    /**
     * @var LoggerInterface|null Optional logger; when null, nothing is logged
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must pass ExternHttpClient::isHttpURL()
     * @param string               $html Raw HTML of the page
     * @param LoggerInterface|null $log  Optional logger
     *
     * @throws Exception when $url is rejected by ExternHttpClient::isHttpURL()
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string The URL given to the constructor.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata extracted from the page HTML.
     *
     * The 'meta' entry contains the parsed <meta> tags plus three synthetic keys:
     * 'html-lang', 'html-title' (both nullable) and 'html-url'.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]
     * @throws Exception propagated from the JSON-LD parsing
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-url'] = $this->url;

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first decoded block that is an array and whose '@type' (when it is
     * a string) does not contain "Breadcrumb"; empty blocks are skipped.
     *
     * @param string $html
     *
     * @return array The decoded JSON-LD data, or an empty array when nothing usable is found.
     * @throws Exception when a non-empty block is not valid JSON (JSON_THROW_ON_ERROR)
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map "name|property|http-equiv value" => "content value".
     *
     * When several tags share the same key, the last one wins (array_combine semantics).
     *
     * todo move? /refac/delete?
     *
     * @param string $str HTML to scan
     *
     * @return array Map of meta key => content; empty when no tag matches.
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            // array_combine() may return false on older PHP versions; normalise to [].
            return $combine ?: [];
        }

        return [];
    }

    /**
     * Build a display-friendly domain name for the page URL.
     *
     * Hosts ending with one of the PRETTY_DOMAIN_EXCLUSION suffixes are returned as
     * extracted by InternetDomainParser::extractSubdomainString(); any other host is
     * reduced to its registrable domain. In both cases a leading "www." is removed.
     *
     * Expected results (carried over from the original documentation; they depend on
     * InternetDomainParser, which is not visible here - TODO confirm):
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * NOTE(review): 'site.google.com' has no leading dot, so the suffix test also
     * matches hosts such as "mysite.google.com" - confirm this is intended.
     *
     * @throws Exception when the registrable domain cannot be determined
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url);
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
    }

    /**
     * Return the registrable domain of the page URL.
     *
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * Any failure is logged as a warning (when a logger is available) and rethrown
     * as a generic Exception that keeps the original one as "previous".
     *
     * @throws Exception
     */
    public function getRegistrableSubDomain(): string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL '.$this->url);
            }

            return InternetDomainParser::getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            if ($this->log !== null) {
                $this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
            }
            throw new Exception('InternetDomainParser::getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }

    /**
     * Extract the language from the <html lang="en-us"> tag.
     *
     * Accepts double-quoted, single-quoted and unquoted attribute values, any
     * whitespace (including line breaks) between attributes, and language tags
     * containing digits (e.g. "es-419"). The value is returned as written in the
     * document (no case normalisation).
     *
     * @param string $html
     *
     * @return string|null The language tag (2 to 15 characters), or null when absent.
     */
    private function parseHtmlLang(string $html): ?string
    {
        // \s before "lang" prevents matching attributes such as data-lang or xml:lang;
        // \1 forces the closing quote to be the same as the opening one (or none).
        $pattern = '#<html\b[^>]*?\slang\s*=\s*(["\']?)([a-z0-9-]{2,15})\1(?=[\s/>])[^>]*>#i';
        if (preg_match($pattern, $html, $matches)) {
            return $matches[2];
        }

        return null;
    }

    /**
     * Extract the webpage title from the HTML <title> element.
     *
     * Not foolproof: a commented-out title such as <!-- <title>bla</title> --> is
     * matched too. Attributes on the <title> tag are tolerated.
     *
     * @param string $html
     *
     * @return string|null The trimmed title text, or null when no non-empty <title> is found.
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Sanitize a (sub)domain name for display by removing a leading "www." label.
     *
     * Only the leading label is removed: a "www." sequence located elsewhere in the
     * host (e.g. "awww.test.com") is part of the name and must be preserved.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        // preg_replace() returns null on PCRE failure; fall back to the input then.
        return preg_replace('#^www\.#i', '', $subDomain) ?? $subDomain;
    }
}


1		<?php
2		/*
3		* This file is part of dispositif/wikibot application (@github)
4		* 2019-2023 © Philippe M./Irønie <[email protected]>
5		* For the full copyright and MIT license information, view the license file.
6		*/
7
8		declare(strict_types=1);
9
10
11		namespace App\Domain;
12
13		use App\Application\Http\ExternHttpClient;
14		use App\Domain\Utils\TextUtil;
15		use App\Infrastructure\InternetDomainParser;
16		use App\Infrastructure\TagParser;
17		use Exception;
18		use Psr\Log\LoggerInterface;
19
20		/**
21		* Représente une page web d'un Lien Externe (hors wiki)
22		* Class ExternPage
23		*
24		* @package App\Domain
25		*/
26		class ExternPage
27		{
28		// todo move to config
29		protected const PRETTY_DOMAIN_EXCLUSION
30		= [
31		'.中国',
32		'.gov',
33		'.free.fr',
34		'.gouv.fr',
35		'.com.cn',
36		'site.google.com',
37		];
38
39		/**
40		* @var string
41		*/
42		private $url;
43
44		/**
45		* @var string
46		*/
47		private $html;
48
49		/**
50	5	* @var LoggerInterface|null
51		*/
52	5	private $log;
53
54		/**
55	5	* ExternPage constructor.
56	5	*
57	5	* @param string $url
58	5	* @param string $html
59		* @param LoggerInterface|null $log
60		*
61		* @throws Exception
62		*/
63		public function __construct(string $url, string $html, ?LoggerInterface $log = null)
64		{
65		if (!ExternHttpClient::isHttpURL($url)) {
66		throw new Exception('string is not an URL '.$url);
67		}
68		$this->url = $url;
69		$this->html = $html;
70		$this->log = $log;
71		}
72	5
73		/**
74	5	* @return string
75	5	*/
76		public function getUrl(): string
77	5	{
78		return $this->url;
79		}
80
81		/**
82		* @return array
83		* @throws Exception
84		*/
85		public function getData(): array
86		{
87		$ld = $this->parseLdJson($this->html);
88		$meta = $this->parseMetaTags($this->html);
89	5
90		$meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
91	5	$meta['html-title'] = $this->parseHtmlTitle($this->html);
92	5	$meta['html-url'] = $this->url;
93	5
94		return ['JSON-LD' => $ld, 'meta' => $meta];
95		}
96	5
97	3	/**
98		* extract LD-JSON metadata from <script type="application/ld+json">.
99	3	*
100		* @param string $html
101		*
102	3	* @return array
103	3	* @throws Exception
104		* @throws Exception
105		*/
106		private function parseLdJson(string $html): array
107	3	{
108		$parser = new TagParser();
109		$results = $parser->importHtml($html)->xpathResults(
110	2	'//script[@type="application/ld+json"]'
111		);
112
113		foreach ($results as $result) {
114		$json = trim($result);
115		// filtrage empty value (todo?)
116		if ($json === '') {
117		continue;
118		}
119		$data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
120	5	if (!is_array($data)
121		|| (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
122	5	) {
123		continue;
124		}
125
126		return $data;
127		}
128
129		return [];
130		}
131
132		/**
133		* todo move? /refac/delete?
134		*
135		* @param string $str
136		*
137	5	* @return array
138	5	*/
139		private function parseMetaTags(string $str): array
140	5	{
141		$pattern = '
142		~<\s*meta\s
143		# using lookahead to capture type to $1
144		(?=[^>]*?
145		\b(?:name|property|http-equiv)\s*=\s*
146		(?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
147		([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
148		)
149		# capture content to $2
150		[^>]*?\bcontent\s*=\s*
151		(?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
152		([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
153		[^>]*>
154		~ix';
155
156		if (preg_match_all($pattern, $str, $out)) {
157		$combine = array_combine($out[1], $out[2]);
158
159		return $combine ?: [];
160		}
161
162		return [];
163		}
164
165		/**
166		* test.com => test.com
167		* bla.test.com => test.com
168		* test.co.uk => test.co.uk (national commercial subdomain)
169		* site.google.com => site.google.com (blog)
170		* bla.site.google.com => site.google.com (blog)
171		*
172		* @throws Exception
173		*/
174		public function getPrettyDomainName(): string
175		{
176		// Parse custom exceptions (free.fr, gouv.fr, etc)
177		$rawDomain = InternetDomainParser::extractSubdomainString($this->url);
178		foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
179		if (TextUtil::str_ends_with($rawDomain, $end)) {
180		return $this->sanitizeSubDomain($rawDomain);
181		}
182		}
183
184		// Parse using InternetDomainParser library
185		return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
186		}
187
188		/**
189		* "http://www.bla.co.uk/fubar" => "bla.co.uk"
190		* @throws Exception
191		*/
192		public function getRegistrableSubDomain(): string
193		{
194		try {
195		if (!ExternHttpClient::isHttpURL($this->url)) {
196		throw new \Exception('string is not an URL '.$this->url);
197		}
198
199		return InternetDomainParser::getRegistrableDomainFromURL($this->url);
200		} catch (Exception $e) {
201		if ($this->log !== null) {
202		$this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
203		}
204		throw new Exception('InternetDomainParser::getRegistrableDomainFromURL NULL', $e->getCode(), $e);
205		}
206		}
207
208		/**
209		* Extract language from <html lang="en-us"> tag.
210		*
211		* @param string $html
212		*
213		* @return string|null
214		*/
215		private function parseHtmlLang(string $html): ?string
216		{
217		if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
218		return $matches[1];
219		}
220
221		return null;
222		}
223
224		/**
225		* Extract webpage title from HTML <title>
226		* not foolproof : example <!-- <title>bla</title> -->
227		*
228		* @param string $html
229		*
230		* @return string|null
231		*/
232		private function parseHtmlTitle(string $html): ?string
233		{
234		if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
235		return trim(strip_tags($matches[1]));
236		}
237
238		return null;
239		}
240
241		/**
242		* TODO strip not unicode characters ?
243		* TODO add initial capital letter ?
244		* This method is used to sanitize subdomain name.
245		* WTF ?!?!?!
246		*/
247		protected function sanitizeSubDomain(string $subDomain): string
248		{
249		return str_replace('www.', '', $subDomain);
250		}
251		}
252

Dispositif / Wikibot

Push — master ( 766a39...696a12 )

ExternPage::parseHtmlLang() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like