ExternPage::parseLdJson() - Code Metrics - Inspection of "Refac ExternPage, ExternRefTransformer" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 9538c8...6c70b1 )

by Dispositif

created 2023-04-16 14:54 UTC

ExternPage::parseLdJson() B

↳ Parent: ExternPage

Complexity

Conditions	8
Paths	5

Size

Total Lines	27
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	8
eloc	14
nc	5
nop	1
dl	0
loc	27
rs	8.4444
c	0
b	0
f	0

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);


namespace App\Domain\ExternLink;

use App\Application\Http\ExternHttpClient;
use App\Domain\Utils\TextUtil;
use App\Infrastructure\InternetDomainParser;
use Exception;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;

/**
 * Represents the web page targeted by an external link (a page outside the wiki).
 *
 * Holds the page URL and its raw HTML, and extracts metadata from them:
 * JSON-LD data, <meta> tags, the <html lang>, <title> and first <h1> values,
 * the robots directive and a "pretty" domain name.
 *
 * @package App\Domain\ExternLink
 */
class ExternPage
{
    // todo move to config
    /**
     * Host-name endings for which getPrettyDomainName() keeps the host extracted
     * from the URL instead of asking the domain parser for the registrable domain.
     *
     * NOTE(review): 'site.google.com' may have been meant as 'sites.google.com' — confirm.
     */
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    /**
     * URL of the page, validated as an HTTP URL by the constructor.
     *
     * @var string
     */
    private $url;

    /**
     * Raw HTML source of the page.
     *
     * @var string
     */
    private $html;

    /** @var TagParserInterface|null Optional parser used to locate the JSON-LD <script> tags. */
    private $tagParser;

    /** @var InternetDomainParserInterface|null Optional parser used to compute the registrable domain. */
    private $domainParser;

    /** @var LoggerInterface Never null: the constructor falls back to a NullLogger. */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string                             $url          URL of the page; must satisfy ExternHttpClient::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    when null, no JSON-LD data is extracted
     * @param InternetDomainParserInterface|null $domainParser when null, no registrable domain can be computed
     * @param LoggerInterface|null               $log          when null, a NullLogger is used
     *
     * @throws Exception when $url is not accepted by ExternHttpClient::isHttpURL()
     */
    public function __construct(
        string                         $url,
        string                         $html,
        ?TagParserInterface            $tagParser = null,
        ?InternetDomainParserInterface $domainParser = null,
        ?LoggerInterface               $log = null
    )
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->tagParser = $tagParser;
        $this->domainParser = $domainParser;
        $this->log = $log ?? new NullLogger();
    }

    /**
     * Returns the URL given to the constructor.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collects all metadata extracted from the page.
     *
     * The 'meta' entry contains the parsed <meta> tags plus the keys 'html-lang',
     * 'html-title', 'html-h1', 'html-url', 'prettyDomainName' and 'robots'.
     * Those keys are written after the <meta> tags are parsed, so they override
     * any <meta> tag carrying the same name.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]
     *
     * @throws Exception when the JSON-LD cannot be decoded or the domain lookup fails
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();
        $meta['robots'] = $this->getMetaRobotsContent($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extracts JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and whose '@type'
     * is not a string containing "Breadcrumb" (case-insensitive). Returns an empty
     * array when no tag parser was injected or when no script qualifies.
     *
     * @throws Exception a JsonException is raised by json_decode() on malformed JSON
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        $results = $this->tagParser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty script contents (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            // skip scalar JSON values and breadcrumb descriptions
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extracts <meta> tags as a "name/property/http-equiv" => "content" map.
     *
     * When several tags share the same name, the last one wins (array_combine).
     *
     * todo move? /refac/delete?
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * Returns a short, human-friendly domain name for the page URL.
     *
     * Hosts ending with one of the PRETTY_DOMAIN_EXCLUSION entries are kept as
     * extracted from the URL; for all other hosts the registrable domain is used,
     * falling back to the extracted host when it is not available. A leading
     * "www." is removed in both cases.
     *
     * Examples from the original author's notes:
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception when the registrable domain lookup fails
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
    }

    /**
     * Returns the registrable domain of the page URL, or null when no domain
     * parser was injected.
     *
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @throws Exception wrapping any Exception raised during the lookup
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            // The logger is always set by the constructor, so no null check is needed.
            $this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);
            throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }

    /**
     * Extracts the language from the <html lang="en-us"> tag.
     *
     * Only double-quoted values made of 2 to 15 letters or hyphens are recognised.
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extracts the trimmed web page title from the HTML <title> tag.
     * Not foolproof: a commented-out title is matched too,
     * for example <!-- <title>bla</title> -->
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extracts the trimmed text of the first <h1> that contains no nested tag.
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Removes a leading "www." label from a host name.
     *
     * Only the prefix is stripped: removing every "www." occurrence would corrupt
     * hosts that merely contain that sequence (e.g. "newwww.com").
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (strncmp($subDomain, 'www.', 4) === 0) {
            // Cast: before PHP 8, substr() returns false when nothing is left.
            return (string) substr($subDomain, 4);
        }

        return $subDomain;
    }

    /**
     * Extracts the robots meta tag content, or '' when there is none.
     * <meta name="robots" content="noindex,noarchive">
     *
     * Only double-quoted attributes with "name" placed before "content" are recognised.
     */
    private function getMetaRobotsContent(string $html): string
    {
        if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
}


1			<?php
2			/*
3			* This file is part of dispositif/wikibot application (@github)
4			* 2019-2023 © Philippe M./Irønie <[email protected]>
5			* For the full copyright and MIT license information, view the license file.
6			*/
7
8			declare(strict_types=1);
9
10
11			namespace App\Domain\ExternLink;
12
13			use App\Application\Http\ExternHttpClient;
14			use App\Domain\Utils\TextUtil;
15			use App\Infrastructure\InternetDomainParser;
16			use Exception;
17			use Psr\Log\LoggerInterface;
18			use Psr\Log\NullLogger;
19
20			/**
21			* Représente une page web d'un Lien Externe (hors wiki)
22			* Class ExternPage
23			* @package App\Domain
24			*/
25			class ExternPage
26			{
27			// todo move to config
28			protected const PRETTY_DOMAIN_EXCLUSION
29			= [
30			'.中国',
31			'.gov',
32			'.free.fr',
33			'.gouv.fr',
34			'.com.cn',
35			'site.google.com',
36			'wordpress.com',
37			'blogspot.com',
38			];
39
40			/**
41			* @var string
42			*/
43			private $url;
44
45			/**
46			* @var string
47			*/
48			private $html;
49
50			/** @var TagParserInterface\|null */
51			private $tagParser;
52
53			/** @var InternetDomainParserInterface\|null */
54			private $domainParser;
55
56			/** @var LoggerInterface */
57			private $log;
58
59			/**
60			* ExternPage constructor.
61			* @throws Exception
62			*/
63			public function __construct(
64			string $url,
65			string $html,
66			?TagParserInterface $tagParser = null,
67			?InternetDomainParserInterface $domainParser = null,
68			?LoggerInterface $log = null
69			)
70			{
71			if (!ExternHttpClient::isHttpURL($url)) {
72			throw new Exception('string is not an URL ' . $url);
73			}
74			$this->url = $url;
75			$this->html = $html;
76			$this->tagParser = $tagParser;
77			$this->domainParser = $domainParser;
78			$this->log = $log ?? new NullLogger();
79			}
80
81			public function getUrl(): string
82			{
83			return $this->url;
84			}
85
86			public function getData(): array
87			{
88			$ld = $this->parseLdJson($this->html);
89			$meta = $this->parseMetaTags($this->html);
90
91			$meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
92			$meta['html-title'] = $this->parseHtmlTitle($this->html);
93			$meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
94			$meta['html-url'] = $this->url;
95			$meta['prettyDomainName'] = $this->getPrettyDomainName();
96			$meta['robots'] = $this->getMetaRobotsContent($this->html);
97
98			return ['JSON-LD' => $ld, 'meta' => $meta];
99			}
100
101			/**
102			* extract LD-JSON metadata from <script type="application/ld+json">.
103			* @throws Exception
104			*/
105			private function parseLdJson(string $html): array
106			{
107			if (!$this->tagParser instanceof TagParserInterface) {
108			return [];
109			}
110
111			$results = $this->tagParser->importHtml($html)->xpathResults(
112			'//script[@type="application/ld+json"]'
113			);
114
115			foreach ($results as $result) {
116			$json = trim($result);
117			// filtrage empty value (todo?)
118			if ($json === '') {
119			continue;
120			}
121			$data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
122			if (!is_array($data)
123			\|\| (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
124			) {
125			continue;
126			}
127
128			return $data;
129			}
130
131			return [];
132			}
133
134			/**
135			* todo move? /refac/delete?
136			*/
137			private function parseMetaTags(string $str): array
138			{
139			$pattern = '
140			~<\s*meta\s
141			# using lookahead to capture type to $1
142			(?=[^>]*?
143			\b(?:name\|property\|http-equiv)\s*=\s*
144			(?\|"\s*([^"]*?)\s*"\|\'\s*([^\']*?)\s*\'\|
145			([^"\'>]*?)(?=\s*/?\s*>\|\s\w+\s*=))
146			)
147			# capture content to $2
148			[^>]*?\bcontent\s*=\s*
149			(?\|"\s*([^"]*?)\s*"\|\'\s*([^\']*?)\s*\'\|
150			([^"\'>]*?)(?=\s*/?\s*>\|\s\w+\s*=))
151			[^>]*>
152			~ix';
153
154			if (preg_match_all($pattern, $str, $out)) {
155			$combine = array_combine($out[1], $out[2]);
156
157			return $combine ?: [];
158			}
159
160			return [];
161			}
162
163			/**
164			* test.com => test.com
165			* bla.test.com => test.com
166			* test.co.uk => test.co.uk (national commercial subdomain)
167			* site.google.com => site.google.com (blog)
168			* bla.site.google.com => site.google.com (blog)
169			*/
170			public function getPrettyDomainName(): string
171			{
172			// Parse custom exceptions (free.fr, gouv.fr, etc)
173			$rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
174			foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
175			if (TextUtil::str_ends_with($rawDomain, $end)) {
176			return $this->sanitizeSubDomain($rawDomain);
177			}
178			}
179
180			// Parse using InternetDomainParser library
181			return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
182			}
183
184			/**
185			* "http://www.bla.co.uk/fubar" => "bla.co.uk"
186			* @throws Exception
187			*/
188			public function getRegistrableSubDomain(): ?string
189			{
190			try {
191			if (!ExternHttpClient::isHttpURL($this->url)) {
192			throw new Exception('string is not an URL ' . $this->url);
193			}
194			if (!$this->domainParser instanceof InternetDomainParserInterface) {
195			$this->log->notice('InternetDomainParser is not set');
196
197			return null;
198			}
199
200			return $this->domainParser->getRegistrableDomainFromURL($this->url);
201			} catch (Exception $e) {
202			if ($this->log !== null) {
203			$this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);
204			}
205			throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', $e->getCode(), $e);
206			}
207			}
208
209			/**
210			* Extract language from <html lang="en-us"> tag.
211			*/
212			private function parseHtmlLang(string $html): ?string
213			{
214			if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
215			return $matches[1];
216			}
217
218			return null;
219			}
220
221			/**
222			* Extract webpage title from HTML <title>
223			* not foolproof : example <!-- <title>bla</title> -->
224			*/
225			private function parseHtmlTitle(string $html): ?string
226			{
227			if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
228			return trim(strip_tags($matches[1]));
229			}
230
231			return null;
232			}
233
234			/**
235			* Extract first <h1> from HTML.
236			*/
237			private function parseHtmlFirstH1(string $html): ?string
238			{
239			if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
240			return trim(strip_tags($matches[1]));
241			}
242
243			return null;
244			}
245
246			/**
247			* TODO strip not unicode characters ?
248			* TODO add initial capital letter ?
249			* This method is used to sanitize subdomain name.
250			* WTF ?!?!?!
251			*/
252			protected function sanitizeSubDomain(string $subDomain): string
253			{
254			return str_replace('www.', '', $subDomain);
255			}
256
257			/**
258			* Extract robots meta tag content.
259			* <meta name="robots" content="noindex,noarchive">
260			*/
261			private function getMetaRobotsContent(string $html): string
262			{
263			if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
264			return $matches[1];
265			}
266
267			return '';
268			}
269			}
270

Dispositif / Wikibot

Push — master ( 9538c8...6c70b1 )

ExternPage::parseLdJson() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like