Passed
Push — master ( f2b4f1...f4fa9e )
by Dispositif
03:37
created

ExternPageFactory::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 2
Code Lines 0

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 0
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Application\InfrastructurePorts\HttpClientInterface;
13
use App\Application\Utils\HttpUtil;
14
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
15
use App\Infrastructure\TagParser;
16
use DomainException;
17
use Exception;
18
use Psr\Log\LoggerInterface;
19
use Psr\Log\NullLogger;
20
21
/**
 * Builds ExternPage objects from a URL by downloading the page source
 * through the injected HTTP client.
 */
class ExternPageFactory
{
    /**
     * @param HttpClientInterface $client HTTP client used to download pages
     * @param LoggerInterface     $log    logger for HTTP errors and skipped content (no-op by default)
     */
    public function __construct(
        protected HttpClientInterface $client,
        protected LoggerInterface $log = new NullLogger(),
    ) {
    }

    /**
     * Download the page at $url and wrap its normalized HTML in an ExternPage.
     *
     * @throws Exception       when $url is not an HTTP(S) URL
     * @throws DomainException when no HTML could be obtained from $url
     */
    public function fromURL(string $url, InternetDomainParserInterface $domainParser): ExternPage
    {
        if (!HttpUtil::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }

        $html = $this->getHTML($url, true);
        if (empty($html)) {
            throw new DomainException('No HTML from requested URL ' . $url);
        }

        return new ExternPage($url, $html, new TagParser(), $domainParser, $this->log);
    }

    /**
     * Import the page source from a URL through the HTTP client.
     *
     * Returns null (and logs) when the response status is not 200 or when the
     * response is declared as application/pdf.
     *
     * todo abstract + refac async request
     *
     * @param string    $url        HTTP(S) URL to request
     * @param bool|null $normalized when true, the body is passed through HttpUtil::normalizeHtml()
     *
     * @return string|null the response body, or null when the response is not usable
     *
     * @throws DomainException when $url is not an HTTP(S) URL
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record => ban ? (checkdnsrr($string, "A"))
        // todo : accept non-ascii URL ?
        //        idn_to_ascii('teßt.com', IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46)
        if (!HttpUtil::isHttpURL($url)) {
            throw new DomainException('URL not compatible : ' . $url);
        }

        $response = $this->client->get($url, [
            'timeout' => 20,
            'allow_redirects' => true, /* note: does not work in Tor proxy mode */
            'headers' => ['User-Agent' => getenv('USER_AGENT')],
            // NOTE(review): TLS certificate verification is disabled here;
            // presumably deliberate to read sites with broken certificates — confirm.
            'verify' => false,
            // 'http_errors' => true, // TRUE: Exception on 4xx 5xx
        ]);

        if (200 !== $response->getStatusCode()) {
            $this->log->error('[z49] HTTP error ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase());

            return null;
        }

        if ($this->isPdfContentType($response->getHeader('Content-Type'))) {
            $this->log->debug('Incompatible application/pdf content-type');

            return null;
        }

        $html = $response->getBody()->getContents();

        return ($normalized) ? HttpUtil::normalizeHtml($html, $url) : $html;
    }

    /**
     * Tell whether the Content-Type header values declare a PDF document.
     *
     * Tolerates a missing header (empty list), surrounding whitespace and
     * letter-case differences in the media type.
     *
     * @param array $headerValues values of the Content-Type header; empty when the header is absent
     */
    private function isPdfContentType(array $headerValues): bool
    {
        // Only the first header value is inspected, as before; a missing header
        // yields an empty string instead of an undefined-index/TypeError.
        $segments = array_map(
            static fn (string $segment): string => strtolower(trim($segment)),
            explode(';', (string) ($headerValues[0] ?? ''))
        );

        return in_array('application/pdf', $segments, true);
    }
}
84