Issues in ExternPage.php (master) - Issues in master - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Issues (106)

src/Domain/ExternLink/ExternPage.php (1 issue)

Labels

Bug 1

Severity

Major 1

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);


namespace App\Domain\ExternLink;

use App\Application\Utils\HttpUtil;
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
use App\Domain\InfrastructurePorts\TagParserInterface;
use App\Domain\Utils\TextUtil;
use App\Infrastructure\InternetDomainParser;
use App\Infrastructure\Monitor\NullLogger;
use Exception;
use Psr\Log\LoggerInterface;

/**
 * Represents the web page targeted by an external (non-wiki) link.
 *
 * Holds the page URL together with its raw HTML and extracts metadata from it:
 * JSON-LD, <meta> tags, <html lang>, <title>, first <h1>, robots directives
 * and a "pretty" domain name.
 *
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    // Host-name endings for which getPrettyDomainName() keeps the raw host name
    // (sub-domain included) instead of asking the domain parser for the registrable domain.
    // NOTE(review): 'site.google.com' looks like it may be meant to be 'sites.google.com' — confirm.
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    /**
     * ExternPage constructor.
     *
     * @param string                             $url          page URL; must be accepted by HttpUtil::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    used for JSON-LD extraction; without it no JSON-LD is read
     * @param InternetDomainParserInterface|null $domainParser used to resolve the registrable domain; optional
     * @param LoggerInterface                    $log          receives notices/warnings about parsing problems
     *
     * @throws Exception when $url is not an HTTP URL
     */
    public function __construct(
        private readonly string                         $url,
        private readonly string                         $html,
        private readonly ?TagParserInterface            $tagParser = null,
        private readonly ?InternetDomainParserInterface $domainParser = null,
        private readonly LoggerInterface                $log = new NullLogger(),
    ) {
        // The promoted property is already initialised here; throwing aborts construction,
        // so no instance with an invalid URL can ever be observed.
        if (!HttpUtil::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }
    }

    /**
     * Returns the URL given at construction.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collects every piece of metadata this class can extract from the page.
     *
     * @return array{JSON-LD: array, meta: array} 'JSON-LD' holds the first usable JSON-LD block (or []),
     *                                            'meta' holds the <meta> tags plus the 'html-lang', 'html-title',
     *                                            'html-h1', 'html-url', 'prettyDomainName' and 'robots' entries
     *
     * @throws Exception when a JSON-LD block is not valid JSON (JsonException from json_decode)
     *                   or when the registrable-domain lookup fails
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();
        $meta['robots'] = $this->getMetaRobotsContent($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extracts JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first block that decodes to an array and whose '@type' (when it is a string)
     * does not contain "Breadcrumb". Returns [] when no tag parser is set, when the parser
     * throws, or when no block qualifies.
     *
     * @throws Exception JsonException (JSON_THROW_ON_ERROR) when a non-empty block is not valid JSON
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        try {
            $results = $this->tagParser->importHtml($html)->xpathResults(
                '//script[@type="application/ld+json"]'
            );
        } catch (Exception $e) {
            // Best effort: a parser failure only costs us the JSON-LD part; keep the cause in the log context.
            $this->log->warning('TagParser->xpathResults NULL ' . $this->url, ['exception' => $e]);

            return [];
        }

        foreach ($results as $result) {
            $json = trim((string) $result);
            // skip empty values (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extracts <meta> tags as a map: name/property/http-equiv value => content value.
     * When the same key appears several times, the last occurrence wins (array_combine).
     *
     * todo move? /refac/delete?
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            // Both capture groups always have the same number of entries, so array_combine() cannot fail.
            return array_combine($out[1], $out[2]);
        }

        return [];
    }

    /**
     * Returns a short, human-friendly domain name for the page URL.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception when the registrable-domain lookup fails (see getRegistrableSubDomain())
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library; fall back to the raw host when no parser is configured
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string|null the value returned by the domain parser, or null when no domain parser is configured
     *
     * @throws Exception when the domain parser fails; the original exception is attached as "previous"
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            // Defensive re-check: the constructor already rejects non-HTTP URLs.
            if (!HttpUtil::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            $this->log->warning(
                'InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url,
                ['exception' => $e]
            );

            // Some exception classes expose a non-integer code; cast so that re-wrapping
            // cannot itself fail with a TypeError under strict_types.
            throw new Exception(
                'InternetDomainParser->getRegistrableDomainFromURL NULL',
                (int) $e->getCode(),
                $e
            );
        }
    }

    /**
     * Extracts the language from the <html lang="en-us"> tag.
     * Only a double-quoted lang attribute made of 2 to 15 letters/hyphens is recognised.
     *
     * @return string|null the lang attribute value, or null when not found
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extracts the webpage title from the HTML <title> tag (attribute-less form only).
     * Not foolproof: a commented-out <!-- <title>bla</title> --> is matched too.
     *
     * @return string|null the trimmed title, or null when not found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extracts the text of the first <h1> that contains no nested tag.
     *
     * @return string|null the trimmed heading text, or null when not found
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Removes the "www." label from a host name.
     *
     * "www." is only removed when it is a complete label (at the start of the host or right
     * after a dot), so a host such as "awww.example.com" is left untouched instead of being
     * mangled into "aexample.com".
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        // (?<![^.]) : not preceded by a non-dot character, i.e. start of string or just after a dot.
        // preg_replace() returns null on a PCRE error; fall back to the unmodified host in that case.
        return preg_replace('#(?<![^.])www\.#', '', $subDomain) ?? $subDomain;
    }

    /**
     * Extracts the robots meta tag content.
     * <meta name="robots" content="noindex,noarchive">
     * Only matches when name="robots" comes before content and both use double quotes.
     *
     * @return string the content value, or '' when no such tag is found
     */
    private function getMetaRobotsContent(string $html): string
    {
        if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
}


1			<?php
2			/*
3			* This file is part of dispositif/wikibot application (@github)
4			* 2019-2023 © Philippe M./Irønie <[email protected]>
5			* For the full copyright and MIT license information, view the license file.
6			*/
7
8			declare(strict_types=1);
9
10
11			namespace App\Domain\ExternLink;
12
13			use App\Application\Utils\HttpUtil;
14			use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
15			use App\Domain\InfrastructurePorts\TagParserInterface;
16			use App\Domain\Utils\TextUtil;
17			use App\Infrastructure\InternetDomainParser;
18			use App\Infrastructure\Monitor\NullLogger;
19			use Exception;
20			use Psr\Log\LoggerInterface;
21
22			/**
23			* Représente une page web d'un Lien Externe (hors wiki)
24			* Class ExternPage
25			* @package App\Domain
26			*/
27			class ExternPage
28			{
29			// todo move to config
30			protected const PRETTY_DOMAIN_EXCLUSION
31			= [
32			'.中国',
33			'.gov',
34			'.free.fr',
35			'.gouv.fr',
36			'.com.cn',
37			'site.google.com',
38			'wordpress.com',
39			'blogspot.com',
40			];
41
42			private readonly string $url;
43
44			/**
45			* ExternPage constructor.
46			* @throws Exception
47			*/
48			public function __construct(
49			string $url,
50			private readonly string $html,
51			private readonly ?TagParserInterface $tagParser = null,
52			private readonly ?InternetDomainParserInterface $domainParser = null,
53			private readonly LoggerInterface $log = new NullLogger()
54			)
55			{
56			if (!HttpUtil::isHttpURL($url)) {
57			throw new Exception('string is not an URL ' . $url);
58			}
59			$this->url = $url;
			0 ignored issues – show Bug introduced 2023-04-30 09:21 UTC by Report Bug Copy Issue Report Show Similar Issues like this The property `url` is declared read-only in `App\Domain\ExternLink\ExternPage`. Loading history...
60			}
61
62			public function getUrl(): string
63			{
64			return $this->url;
65			}
66
67			public function getData(): array
68			{
69			$ld = $this->parseLdJson($this->html);
70			$meta = $this->parseMetaTags($this->html);
71
72			$meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
73			$meta['html-title'] = $this->parseHtmlTitle($this->html);
74			$meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
75			$meta['html-url'] = $this->url;
76			$meta['prettyDomainName'] = $this->getPrettyDomainName();
77			$meta['robots'] = $this->getMetaRobotsContent($this->html);
78
79			return ['JSON-LD' => $ld, 'meta' => $meta];
80			}
81
82			/**
83			* extract LD-JSON metadata from <script type="application/ld+json">.
84			* @throws Exception
85			*/
86			private function parseLdJson(string $html): array
87			{
88			if (!$this->tagParser instanceof TagParserInterface) {
89			return [];
90			}
91
92			try {
93			$results = $this->tagParser->importHtml($html)->xpathResults(
94			'//script[@type="application/ld+json"]'
95			);
96			} catch (Exception $e) {
97			$this->log->warning('TagParser->xpathResults NULL ' . $this->url);
98
99			return [];
100			}
101
102			foreach ($results as $result) {
103			$json = trim((string) $result);
104			// filtrage empty value (todo?)
105			if ($json === '') {
106			continue;
107			}
108			$data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
109			if (!is_array($data)
110			|| (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
111			) {
112			continue;
113			}
114
115			return $data;
116			}
117
118			return [];
119			}
120
121			/**
122			* todo move? /refac/delete?
123			*/
124			private function parseMetaTags(string $str): array
125			{
126			$pattern = '
127			~<\s*meta\s
128			# using lookahead to capture type to $1
129			(?=[^>]*?
130			\b(?:name|property|http-equiv)\s*=\s*
131			(?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
132			([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
133			)
134			# capture content to $2
135			[^>]*?\bcontent\s*=\s*
136			(?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
137			([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
138			[^>]*>
139			~ix';
140
141			if (preg_match_all($pattern, $str, $out)) {
142			$combine = array_combine($out[1], $out[2]);
143
144			return $combine ?: [];
145			}
146
147			return [];
148			}
149
150			/**
151			* test.com => test.com
152			* bla.test.com => test.com
153			* test.co.uk => test.co.uk (national commercial subdomain)
154			* site.google.com => site.google.com (blog)
155			* bla.site.google.com => site.google.com (blog)
156			*/
157			public function getPrettyDomainName(): string
158			{
159			// Parse custom exceptions (free.fr, gouv.fr, etc)
160			$rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
161			foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
162			if (TextUtil::str_ends_with($rawDomain, $end)) {
163			return $this->sanitizeSubDomain($rawDomain);
164			}
165			}
166
167			// Parse using InternetDomainParser library
168			return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
169			}
170
171			/**
172			* "http://www.bla.co.uk/fubar" => "bla.co.uk"
173			* @throws Exception
174			*/
175			public function getRegistrableSubDomain(): ?string
176			{
177			try {
178			if (!HttpUtil::isHttpURL($this->url)) {
179			throw new Exception('string is not an URL ' . $this->url);
180			}
181			if (!$this->domainParser instanceof InternetDomainParserInterface) {
182			$this->log->notice('InternetDomainParser is not set');
183
184			return null;
185			}
186
187			return $this->domainParser->getRegistrableDomainFromURL($this->url);
188			} catch (Exception $e) {
189			if ($this->log !== null) {
190			$this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);
191			}
192			throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', $e->getCode(), $e);
193			}
194			}
195
196			/**
197			* Extract language from <html lang="en-us"> tag.
198			*/
199			private function parseHtmlLang(string $html): ?string
200			{
201			if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
202			return $matches[1];
203			}
204
205			return null;
206			}
207
208			/**
209			* Extract webpage title from HTML <title>
210			* not foolproof : example <!-- <title>bla</title> -->
211			*/
212			private function parseHtmlTitle(string $html): ?string
213			{
214			if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
215			return trim(strip_tags($matches[1]));
216			}
217
218			return null;
219			}
220
221			/**
222			* Extract first <h1> from HTML.
223			*/
224			private function parseHtmlFirstH1(string $html): ?string
225			{
226			if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
227			return trim(strip_tags($matches[1]));
228			}
229
230			return null;
231			}
232
233			/**
234			* TODO strip not unicode characters ?
235			* TODO add initial capital letter ?
236			* This method is used to sanitize subdomain name.
237			* WTF ?!?!?!
238			*/
239			protected function sanitizeSubDomain(string $subDomain): string
240			{
241			return str_replace('www.', '', $subDomain);
242			}
243
244			/**
245			* Extract robots meta tag content.
246			* <meta name="robots" content="noindex,noarchive">
247			*/
248			private function getMetaRobotsContent(string $html): string
249			{
250			if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
251			return $matches[1];
252			}
253
254			return '';
255			}
256			}
257

Dispositif / Wikibot

Issues (106)

src/Domain/ExternLink/ExternPage.php (1 issue)

Labels

Severity

Introduced By

Duplication Side-by-Side

Filter issues like