ExternPage::parseHtmlFirstH1() - Code Metrics - Inspection of "style, cosmetic" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( dff8a4...2556d0 )

by Dispositif

created 2023-04-08 11:33 UTC

ExternPage::parseHtmlFirstH1() A

↳ Parent: ExternPage

Complexity

Conditions	2
Paths	2

Size

Total Lines	7
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	6

Importance

Changes

Metric	Value
cc	2
eloc	3
c	0
b	0
f	0
nc	2
nop	1
dl	0
loc	7
ccs	0
cts	0
cp	0
crap	6
rs	10

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);


namespace App\Domain;

use App\Application\Http\ExternHttpClient;
use App\Domain\Utils\TextUtil;
use App\Infrastructure\InternetDomainParser;
use App\Infrastructure\TagParser;
use Exception;
use Psr\Log\LoggerInterface;

/**
 * Represents the web page targeted by an external (non-wiki) link.
 *
 * Wraps a URL together with its already-downloaded HTML and extracts
 * metadata from that HTML: JSON-LD, <meta> tags, language, title, first <h1>
 * and a "pretty" domain name.
 *
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    /**
     * Host-name endings for which getPrettyDomainName() keeps the raw host
     * instead of reducing it to the registrable domain.
     *
     * NOTE(review): some entries have no leading dot ('wordpress.com'), so a
     * plain suffix test also matches hosts such as "notwordpress.com".
     */
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    /**
     * URL of the page, validated as HTTP(S) by the constructor.
     *
     * @var string
     */
    private $url;

    /**
     * Raw HTML source of the page.
     *
     * @var string
     */
    private $html;

    /**
     * Optional logger; when null, nothing is logged.
     *
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must pass ExternHttpClient::isHttpURL()
     * @param string               $html raw HTML of the page
     * @param LoggerInterface|null $log  optional logger used for warnings
     *
     * @throws Exception when $url is not accepted by ExternHttpClient::isHttpURL()
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata this class can extract from the page.
     *
     * The 'meta' entry holds the parsed <meta> tags plus the extra keys
     * 'html-lang', 'html-title', 'html-h1', 'html-url' and 'prettyDomainName'.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]
     *
     * @throws Exception when JSON-LD decoding or domain extraction fails
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first script payload that decodes to an array and whose
     * string '@type' does not contain "Breadcrumb" (case-insensitive).
     *
     * @return array decoded JSON-LD data, or [] when no usable payload exists
     *
     * @throws Exception on malformed JSON (json_decode runs with JSON_THROW_ON_ERROR)
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty script payloads (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            // Ignore scalar payloads and breadcrumb descriptions, keep looking.
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map of name/property/http-equiv => content.
     *
     * When the same key appears several times, the last occurrence wins
     * (array_combine semantics).
     *
     * todo move? /refac/delete?
     *
     * @return array<string, string> [] when no <meta> tag matches
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * Human-friendly domain name for the page URL.
     *
     * Hosts ending with one of PRETTY_DOMAIN_EXCLUSION keep the string
     * returned by InternetDomainParser::extractSubdomainString(); every other
     * host is reduced with getRegistrableSubDomain(). In both cases a leading
     * "www." is removed.
     *
     * Intended results according to the original author (they depend on
     * InternetDomainParser, which is not visible here):
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception when the registrable domain cannot be determined
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url);
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
    }

    /**
     * Registrable domain of the page URL.
     *
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * Any failure is logged as a warning (when a logger is set) and re-thrown
     * as a generic Exception that keeps the original one as "previous".
     *
     * @throws Exception
     */
    public function getRegistrableSubDomain(): string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL '.$this->url);
            }

            return InternetDomainParser::getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            if ($this->log !== null) {
                $this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
            }
            throw new Exception('InternetDomainParser::getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute made of 2 to 15 letters or hyphens
     * is recognised.
     *
     * @return string|null the attribute value as written, or null when absent
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>.
     *
     * The opening tag may carry attributes (e.g. <title data-react-helmet="true">).
     * Not foolproof : example <!-- <title>bla</title> -->
     *
     * @return string|null trimmed title text, or null when no <title> matches
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract first <h1> from HTML.
     *
     * The heading may contain inline markup (<span>, <a>, ...): the whole
     * element content is captured and its tags are stripped afterwards.
     *
     * @return string|null trimmed heading text, or null when no <h1> matches
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        // "s" lets "." span newlines; the lazy quantifier stops at the first </h1>.
        if (preg_match('#<h1(?:\s[^>]*)?>(.*?)</h1>#is', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Remove a leading "www." from a host name.
     *
     * Only the prefix is removed, so "awww.example.com" is left untouched.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (strncmp($subDomain, 'www.', 4) === 0) {
            return substr($subDomain, 4);
        }

        return $subDomain;
    }
}


1		<?php
2		/*
3		* This file is part of dispositif/wikibot application (@github)
4		* 2019-2023 © Philippe M./Irønie <[email protected]>
5		* For the full copyright and MIT license information, view the license file.
6		*/
7
8		declare(strict_types=1);
9
10
11		namespace App\Domain;
12
13		use App\Application\Http\ExternHttpClient;
14		use App\Domain\Utils\TextUtil;
15		use App\Infrastructure\InternetDomainParser;
16		use App\Infrastructure\TagParser;
17		use Exception;
18		use Psr\Log\LoggerInterface;
19
20		/**
21		* Représente une page web d'un Lien Externe (hors wiki)
22		* Class ExternPage
23		*
24		* @package App\Domain
25		*/
26		class ExternPage
27		{
28		// todo move to config
29		protected const PRETTY_DOMAIN_EXCLUSION
30		= [
31		'.中国',
32		'.gov',
33		'.free.fr',
34		'.gouv.fr',
35		'.com.cn',
36		'site.google.com',
37		'wordpress.com',
38		'blogspot.com',
39		];
40
41		/**
42		* @var string
43		*/
44		private $url;
45
46		/**
47		* @var string
48		*/
49		private $html;
50	5
51		/**
52	5	* @var LoggerInterface\|null
53		*/
54		private $log;
55	5
56	5	/**
57	5	* ExternPage constructor.
58	5	*
59		* @throws Exception
60		*/
61		public function __construct(string $url, string $html, ?LoggerInterface $log = null)
62		{
63		if (!ExternHttpClient::isHttpURL($url)) {
64		throw new Exception('string is not an URL '.$url);
65		}
66		$this->url = $url;
67		$this->html = $html;
68		$this->log = $log;
69		}
70
71		public function getUrl(): string
72	5	{
73		return $this->url;
74	5	}
75	5
76		public function getData(): array
77	5	{
78		$ld = $this->parseLdJson($this->html);
79		$meta = $this->parseMetaTags($this->html);
80
81		$meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
82		$meta['html-title'] = $this->parseHtmlTitle($this->html);
83		$meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
84		$meta['html-url'] = $this->url;
85		$meta['prettyDomainName'] = $this->getPrettyDomainName();
86
87		return ['JSON-LD' => $ld, 'meta' => $meta];
88		}
89	5
90		/**
91	5	* extract LD-JSON metadata from <script type="application/ld+json">.
92	5	*
93	5	* @throws Exception
94		*/
95		private function parseLdJson(string $html): array
96	5	{
97	3	$parser = new TagParser();
98		$results = $parser->importHtml($html)->xpathResults(
99	3	'//script[@type="application/ld+json"]'
100		);
101
102	3	foreach ($results as $result) {
103	3	$json = trim($result);
104		// filtrage empty value (todo?)
105		if ($json === '') {
106		continue;
107	3	}
108		$data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
109		if (!is_array($data)
110	2	\|\| (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
111		) {
112		continue;
113		}
114
115		return $data;
116		}
117
118		return [];
119		}
120	5
121		/**
122	5	* todo move? /refac/delete?
123		*/
124		private function parseMetaTags(string $str): array
125		{
126		$pattern = '
127		~<\s*meta\s
128		# using lookahead to capture type to $1
129		(?=[^>]*?
130		\b(?:name\|property\|http-equiv)\s=\s
131		(?\|"\s([^"]?)\s"\|\'\s([^\']?)\s\'\|
132		([^"\'>]?)(?=\s/?\s>\|\s\w+\s=))
133		)
134		# capture content to $2
135		[^>]?\bcontent\s=\s*
136		(?\|"\s([^"]?)\s"\|\'\s([^\']?)\s\'\|
137	5	([^"\'>]?)(?=\s/?\s>\|\s\w+\s=))
138	5	[^>]*>
139		~ix';
140	5
141		if (preg_match_all($pattern, $str, $out)) {
142		$combine = array_combine($out[1], $out[2]);
143
144		return $combine ?: [];
145		}
146
147		return [];
148		}
149
150		/**
151		* test.com => test.com
152		* bla.test.com => test.com
153		* test.co.uk => test.co.uk (national commercial subdomain)
154		* site.google.com => site.google.com (blog)
155		* bla.site.google.com => site.google.com (blog)
156		*/
157		public function getPrettyDomainName(): string
158		{
159		// Parse custom exceptions (free.fr, gouv.fr, etc)
160		$rawDomain = InternetDomainParser::extractSubdomainString($this->url);
161		foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
162		if (TextUtil::str_ends_with($rawDomain, $end)) {
163		return $this->sanitizeSubDomain($rawDomain);
164		}
165		}
166
167		// Parse using InternetDomainParser library
168		return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
169		}
170
171		/**
172		* "http://www.bla.co.uk/fubar" => "bla.co.uk"
173		* @throws Exception
174		*/
175		public function getRegistrableSubDomain(): string
176		{
177		try {
178		if (!ExternHttpClient::isHttpURL($this->url)) {
179		throw new Exception('string is not an URL '.$this->url);
180		}
181
182		return InternetDomainParser::getRegistrableDomainFromURL($this->url);
183		} catch (Exception $e) {
184		if ($this->log !== null) {
185		$this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
186		}
187		throw new Exception('InternetDomainParser::getRegistrableDomainFromURL NULL', $e->getCode(), $e);
188		}
189		}
190
191		/**
192		* Extract language from <html lang="en-us"> tag.
193		*/
194		private function parseHtmlLang(string $html): ?string
195		{
196		if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
197		return $matches[1];
198		}
199
200		return null;
201		}
202
203		/**
204		* Extract webpage title from HTML <title>
205		* not foolproof : example <!-- <title>bla</title> -->
206		*/
207		private function parseHtmlTitle(string $html): ?string
208		{
209		if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
210		return trim(strip_tags($matches[1]));
211		}
212
213		return null;
214		}
215
216		/**
217		* Extract first <h1> from HTML.
218		*/
219		private function parseHtmlFirstH1(string $html): ?string
220		{
221		if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
222		return trim(strip_tags($matches[1]));
223		}
224
225		return null;
226		}
227
228		/**
229		* TODO strip not unicode characters ?
230		* TODO add initial capital letter ?
231		* This method is used to sanitize subdomain name.
232		* WTF ?!?!?!
233		*/
234		protected function sanitizeSubDomain(string $subDomain): string
235		{
236		return str_replace('www.', '', $subDomain);
237		}
238		}
239

Dispositif / Wikibot

Push — master ( dff8a4...2556d0 )

ExternPage::parseHtmlFirstH1() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like