ExternHttpClient::getClient() - Code Metrics - Inspection of "Add webarchive (wikiwix) adapter and transform" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( cd664b...7baf30 )

by Dispositif

created 2023-10-04 14:14 UTC

ExternHttpClient::getClient() A

↳ Parent: ExternHttpClient

Complexity

Conditions	1
Paths	1

Size

Total Lines	3
Code Lines	1

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	2

Importance

Changes

Metric	Value
eloc	1
dl	0
loc	3
rs	10
c	0
b	0
f	0
ccs	0
cts	0
cp	0
cc	1
nc	1
nop	0
crap	2

<?php

/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */
declare(strict_types=1);

namespace App\Application\Http;

use App\Domain\InfrastructurePorts\ExternHttpClientInterface;
use DomainException;
use GuzzleHttp\Client;
use Normalizer;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
use Throwable;

/**
 * Http client (Guzzle) configured for web crawling.
 *
 * Wraps a pre-configured Guzzle client and provides helpers to download a page
 * and, on request, convert its HTML to normalized UTF-8.
 */
class ExternHttpClient implements ExternHttpClientInterface
{
    private readonly Client $client;

    /**
     * @param LoggerInterface $log Receives the HTTP error reports; defaults to a no-op logger.
     */
    public function __construct(private readonly LoggerInterface $log = new NullLogger())
    {
        // getenv() returns false when USER_AGENT is not defined: only send the header
        // when a real value exists, otherwise leave the choice of User-Agent to Guzzle.
        $headers = [];
        $userAgent = getenv('USER_AGENT');
        if (is_string($userAgent) && '' !== $userAgent) {
            $headers['User-Agent'] = $userAgent;
        }

        $this->client = new Client(
            [
                'timeout' => 30,
                'allow_redirects' => true,
                'headers' => $headers,
                // NOTE(review): TLS certificate verification is disabled for every request.
                // Presumably deliberate (crawling sites with broken certificates) — confirm.
                'verify' => false,
                //                'proxy'           => '192.192.192.192:10',
            ]
        );
    }

    /**
     * Expose the underlying Guzzle client.
     *
     * Hack for WikiwixAdapter.
     * todo : rather provide a request() method compatible with ClientInterface ?
     */
    public function getClient(): Client
    {
        return $this->client;
    }

    /**
     * Import the source of an URL with Guzzle.
     * todo abstract + refac async request
     *
     * @param string    $url        Absolute http(s) URL, validated with isHttpURL().
     * @param bool|null $normalized When true, the body is converted to normalized UTF-8.
     *
     * @return string|null The response body, or null when the status code is not 200.
     *
     * @throws DomainException When the URL is rejected or the UTF-8 conversion fails.
     *                         Exceptions raised by the Guzzle client are not caught here.
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record => ban ?
        // todo : accept non-ascii URL ?
        // idn_to_ascii($url);
        // idn_to_ascii('teßt.com',IDNA_NONTRANSITIONAL_TO_ASCII,INTL_IDNA_VARIANT_UTS46)
        // checkdnsrr($string, "A") // check DNS record
        if (!self::isHttpURL($url)) {
            throw new DomainException('URL not compatible : ' . $url);
        }
        $response = $this->client->get($url);

        $statusCode = $response->getStatusCode();
        if (200 !== $statusCode) {
            // NOTE(review): direct console output kept for backward compatibility;
            // the same information is sent to the logger.
            echo 'HTTP error ' . $statusCode;
            $this->log->error('HTTP error ' . $statusCode . ' ' . $response->getReasonPhrase());

            return null;
        }

        $html = $response->getBody()->getContents();

        return $normalized ? $this->normalizeHtml($html, $url) : $html;
    }

    /**
     * Todo Move HttpUtils ?
     * Better than filter_var($url, FILTER_VALIDATE_URL) because it's not multibyte capable.
     * See for example .中国 domain name
     *
     * @return bool True when $url starts with http:// or https:// (case-insensitive)
     *              and contains no space character.
     */
    public static function isHttpURL(string $url): bool
    {
        return (bool)preg_match('#^https?://[^ ]+$#i', $url);
    }

    /**
     * Normalize the HTML and convert it to UTF-8 encoding when needed.
     *
     * The text is first passed as-is to Normalizer; when that fails, it is converted
     * with iconv from the charset found by extractCharset() (WINDOWS-1252 when none
     * is found) and normalized again.
     *
     * @param string      $html Raw HTML.
     * @param string|null $url  Only used in the exception messages.
     *
     * @throws DomainException When the conversion or the final normalization fails.
     */
    private function normalizeHtml(string $html, ?string $url = ''): string
    {
        if ('' === $html) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && '' !== $normalized) {
            return $normalized;
        }

        $charset = $this->extractCharset($html) ?? 'WINDOWS-1252';

        try {
            $converted = iconv($charset, 'UTF-8//TRANSLIT', $html);
            if (false === $converted) {
                throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
            }
            $normalized = Normalizer::normalize($converted);
            if (!is_string($normalized)) {
                throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
            }
        } catch (DomainException $e) {
            // Keep the specific failure message instead of re-wrapping our own exception.
            throw $e;
        } catch (Throwable $e) {
            // getCode() is not guaranteed to be an int, and strict_types would reject it.
            throw new DomainException("error converting : $charset to UTF-8 on " . $url, (int) $e->getCode(), $e);
        }

        return $normalized;
    }

    /**
     * Extract charset from HTML text.
     *
     * Reads the charset declared in a <meta> tag (tag and attribute names are matched
     * case-insensitively); when none is declared, falls back to mb_detect_encoding().
     *
     * @return string|null The charset name, or null when nothing could be determined.
     */
    private function extractCharset(string $html): ?string
    {
        $charset = null;
        if (preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        )
        ) {
            // Group 2 holds the value following "charset=".
            $charset = $matches[2];
        }
        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, null, true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}


1		<?php
2
3		/*
4		* This file is part of dispositif/wikibot application (@github)
5		* 2019-2023 © Philippe M./Irønie <[email protected]>
6		* For the full copyright and MIT license information, view the license file.
7		*/
8		declare(strict_types=1);
9
10		namespace App\Application\Http;
11
12		use App\Domain\InfrastructurePorts\ExternHttpClientInterface;
13		use DomainException;
14		use GuzzleHttp\Client;
15		use Normalizer;
16		use Psr\Log\LoggerInterface;
17		use Psr\Log\NullLogger;
18		use Throwable;
19
20		/**
21		* Http client (Guzzle) configured for web crawling.
22		*/
23		class ExternHttpClient implements ExternHttpClientInterface
24		{
25		private readonly Client $client;
26
27		public function __construct(private readonly LoggerInterface $log = new NullLogger())
28		{
29		$this->client = new Client(
		0 ignored issues – show Bug introduced 2023-04-30 09:21 UTC by Report Bug Copy Issue Report The property `client` is declared read-only in `App\Application\Http\ExternHttpClient`. Loading history...
30		[
31		'timeout' => 30,
32		'allow_redirects' => true,
33		'headers' => ['User-Agent' => getenv('USER_AGENT')],
34		'verify' => false, // CURLOPT_SSL_VERIFYHOST
35		// 'proxy' => '192.192.192.192:10',
36		]
37		);
38		}
39
40		//hack for WikiwixAdapter todo : plutôt request() compatible ClientInterface ?
41		public function getClient(): Client
42		{
43		return $this->client;
44		}
45
46		/**
47		* import source from URL with Guzzle.
48		* todo abstract + refac async request
49		*/
50		public function getHTML(string $url, ?bool $normalized = false): ?string
51		{
52		// todo : check banned domains ?
53		// todo : check DNS record => ban ?
54		// todo : accept non-ascii URL ?
55		// idn_to_ascii($url);
56		// idn_to_ascii('teßt.com',IDNA_NONTRANSITIONAL_TO_ASCII,INTL_IDNA_VARIANT_UTS46)
57		// checkdnsrr($string, "A") // check DNS record
58		if (!self::isHttpURL($url)) {
59		throw new DomainException('URL not compatible : ' . $url);
60		}
61		$response = $this->client->get($url);
62
63		if (200 !== $response->getStatusCode()) {
64		echo 'HTTP error ' . $response->getStatusCode();
65		$this->log->error('HTTP error ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase());
66
67		return null;
68		}
69
70		$html = $response->getBody()->getContents() ?? '';
71
72		return ($normalized) ? $this->normalizeHtml($html, $url) : $html;
73	6	}
74
75	6	/**
76	1	* Todo Move HttpUtils ?
77		* Better than filter_var($url, FILTER_VALIDATE_URL) because it's not multibyte capable.
78		* See for example .中国 domain name
79	6	*/
80		public static function isHttpURL(string $url): bool
81		{
82		return (bool)preg_match('#^https?://[^ ]+$#i', $url);
83		}
84
85		/**
86		* Normalize and converting to UTF-8 encoding
87		*/
88		private function normalizeHtml(string $html, ?string $url = ''): ?string
89		{
90		$e = null;
91		if (empty($html)) {
92		return $html;
93		}
94
95		$html2 = Normalizer::normalize($html);
96
97		if (is_string($html2) && !empty($html2)) {
98		return $html2;
99		}
100
101		$charset = $this->extractCharset($html) ?? 'WINDOWS-1252';
102
103		if (empty($charset)) {
104		throw new DomainException('normalized html error and no charset found : ' . $url);
105		}
106		try {
107		$html2 = iconv($charset, 'UTF-8//TRANSLIT', $html);
108		if (false === $html2) {
109		throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
110		}
111		$html2 = Normalizer::normalize($html2);
112		if (!is_string($html2)) {
		0 ignored issues – show introduced 2020-11-04 13:31 UTC by Report Bug Copy Issue Report The condition `is_string($html2)` is always `true`. Loading history...
113		throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
114		}
115		} catch (Throwable $e) {
116		throw new DomainException("error converting : $charset to UTF-8 on " . $url, $e->getCode(), $e);
117		}
118
119		return $html2;
120		}
121
122		/**
123		* Extract charset from HTML text
124		*/
125		private function extractCharset(string $html): ?string
126		{
127		if (preg_match(
128		'#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#',
129		$html,
130		$matches
131		)
132		) {
133		$charset = $matches[2] ?? $matches[1] ?? null;
134		}
135		if (empty($charset)) {
136
137		$encoding = mb_detect_encoding($html, null, true);
138		$charset = is_string($encoding) ? strtoupper($encoding) : null;
		0 ignored issues – show introduced 2020-08-04 16:23 UTC by Report Bug Copy Issue Report The condition `is_string($encoding)` is always `true`. Loading history...
139		}
140
141		return $charset;
142		}
143		}
144

Dispositif / Wikibot

Push — master ( cd664b...7baf30 )

ExternHttpClient::getClient() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like