Passed
Push — master ( 8e7e6d...9f9e2a )
by Dispositif
03:39
created

DeadLinkTransformer::formatFromUrl()   B

Complexity

Conditions 7
Paths 14

Size

Total Lines 29
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
eloc 15
c 2
b 0
f 0
dl 0
loc 29
rs 8.8333
cc 7
nc 14
nop 2
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Domain\InfrastructurePorts\DeadlinkArchiverInterface;
13
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
14
use App\Domain\Models\Summary;
15
use App\Domain\Models\WebarchiveDTO;
16
use App\Domain\Publisher\ExternMapper;
17
use App\Infrastructure\InternetDomainParser;
18
use App\Infrastructure\Monitor\NullLogger;
19
use App\Infrastructure\ServiceFactory;
20
use DateTimeImmutable;
21
use DateTimeInterface;
22
use Psr\Log\LoggerInterface;
23
24
/**
25
 * Transform a dead link URL into a {{Lien brisé}} template, or into a citation built from a web archive copy when one is found.
26
 */
27
class DeadLinkTransformer
28
{
29
    /** Flag handed to ServiceFactory::getHttpClient() when the default archive transformer is built. */
    private const USE_TOR_FOR_ARCHIVE = false;

    /** Number of seconds slept before the archive page is processed. */
    private const DELAY_PARSE_ARCHIVE = 3;

    /** When true, a raw Wikiwix cache URL result is replaced by a generated {{Lien web}} template. */
    private const REPLACE_RAW_WIKIWIX_BY_LIENWEB = false;

    /**
     * @param DeadlinkArchiverInterface[]         $archivers            archivers among which one is picked at random per URL
     * @param InternetDomainParserInterface|null  $domainParser         optional; used to pass the original registrable domain as an option
     * @param ExternRefTransformerInterface|null  $externRefTransformer optional; a default one is built lazily when null
     * @param LoggerInterface                     $log                  logger receiving notice/debug messages
     */
    public function __construct(
        protected array $archivers = [],
        protected ?InternetDomainParserInterface $domainParser = null,
        protected ?ExternRefTransformerInterface $externRefTransformer = null,
        protected LoggerInterface $log = new NullLogger(),
    ) {
    }
44
45
    /**
     * Convert a dead URL into wikitext.
     *
     * Web archive URLs are returned untouched. Otherwise one archiver is asked for
     * an archived copy: when a copy is found the result is built from the archive,
     * else a {{Lien brisé}} template is generated.
     *
     * @param string            $url dead link URL
     * @param DateTimeInterface $now date used for the "brisé le" parameter
     */
    public function formatFromUrl(string $url, DateTimeInterface $now = new DateTimeImmutable()): string
    {
        // Pick one archiver at random (null when none is configured).
        // TODO choose IA for IA urls, Wikiwix for Wikiwix urls, etc
        $archiver = $this->archivers === []
            ? null
            : $this->archivers[array_rand($this->archivers)];

        // HACK: temporarily skip the transformation on archiver URLs (avoids an IA archive of a Wikiwix URL)
        if ($this->isWebArchiveUrl($url)) {
            $this->log->notice(
                'Skip {lien brisé} on web archive url',
                ['stats' => 'externref.skip.lienBriseOnwebarchiveurl']
            );

            return $url;
        }

        // No usable archiver: fall back to the broken-link template without logging.
        if (!$archiver instanceof DeadlinkArchiverInterface) {
            return $this->generateLienBrise($url, $now);
        }

        $archive = $archiver->searchWebarchive($url);
        if (!$archive instanceof WebarchiveDTO) {
            $this->log->notice('web archive not found');

            return $this->generateLienBrise($url, $now);
        }

        $foundMessage = match ($archive->getArchiver()) {
            '[[Wikiwix]]' => '🥝 Wikiwix found',
            '[[Internet Archive]]' => '🏛️ InternetArchive found',
            default => null,
        };
        if ($foundMessage !== null) {
            $this->log->notice($foundMessage);
        }
        $this->log->debug('archive url: ' . $archive->getArchiveUrl());

        return $this->generateLienWebFromArchive($archive);
    }
75
76
    /**
     * Build the wikitext for a found web archive.
     *
     * Sleeps DELAY_PARSE_ARCHIVE seconds, then runs the archive URL through
     * externRefProcessOnArchive(). When REPLACE_RAW_WIKIWIX_BY_LIENWEB is enabled and
     * that result is still a raw Wikiwix cache URL, a {{Lien web}} template is
     * generated from the DTO instead.
     */
    private function generateLienWebFromArchive(WebarchiveDTO $dto): string
    {
        sleep(self::DELAY_PARSE_ARCHIVE);

        $processed = $this->externRefProcessOnArchive($dto);

        // Wikiwix : "Sorry, this system is overloaded. Please come back in a minute."
        // Also covers content-type 'application/pdf', which ExternRefTransformer does not parse.
        $isRawWikiwix = self::REPLACE_RAW_WIKIWIX_BY_LIENWEB
            && str_starts_with($processed, 'https://archive.wikiwix.com/cache/');
        if (!$isRawWikiwix) {
            return $processed;
        }

        $this->log->notice('Replace raw wikiwix by lien web');

        $title = 'Archive ' . $this->generateTitleFromURLText($dto->getOriginalUrl()) . '<!-- titre à compléter -->';
        $archiveDate = $dto->getArchiveDate();

        return sprintf(
            '{{Lien web |url= %s |titre=%s |site= %s |consulté le=%s |archive-date=%s}}',
            $dto->getArchiveUrl(),
            $title,
            'via ' . $dto->getArchiver(),
            date('d-m-Y'),
            $archiveDate instanceof DateTimeInterface ? $archiveDate->format('d-m-Y') : ''
        );
    }
102
103
    /**
     * Run the extern-ref transformer on the archive URL, to extract the
     * title+author+lang+… from the web archive page.
     *
     * A default ExternRefTransformer is created on first use when none was injected.
     * When a domain parser is available, the registrable domain of the original URL
     * is passed as the 'originalRegistrableDomain' option.
     */
    private function externRefProcessOnArchive(WebarchiveDTO $dto): string
    {
        $summary = new Summary('test');

        // Lazy default: only built when the constructor received no transformer.
        $this->externRefTransformer ??= new ExternRefTransformer(
            new ExternMapper($this->log),
            ServiceFactory::getHttpClient(self::USE_TOR_FOR_ARCHIVE),
            new InternetDomainParser(),
            $this->log,
        ); // todo inverse dependency

        $options = [];
        if ($this->domainParser !== null) {
            $options['originalRegistrableDomain'] = $this->domainParser->getRegistrableDomainFromURL(
                $dto->getOriginalUrl()
            );
        }

        return $this->externRefTransformer->process($dto->getArchiveUrl(), $summary, $options);
    }
124
125
    /**
     * Derive a short placeholder title from a URL.
     *
     * Removes every 'https://', 'http://' and 'www.' occurrence, then truncates the
     * text to 30 characters followed by '…' when it is longer.
     *
     * Lengths are counted in UTF-8 characters, not bytes, so a URL containing
     * multibyte characters is never cut in the middle of a character.
     *
     * @param string $url source URL
     *
     * @return string title text of at most 30 characters plus the ellipsis
     */
    protected function generateTitleFromURLText(string $url): string
    {
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8') . '…';
        }

        return $text;
    }
134
135
    /**
     * Build the {{Lien brisé}} wikitext template for a dead URL.
     *
     * Web archive URLs (as detected by isWebArchiveUrl()) are returned unchanged
     * and the skip is logged.
     *
     * @param string            $url dead link URL
     * @param DateTimeInterface $now date written in the "brisé le" parameter (d-m-Y)
     */
    protected function generateLienBrise(string $url, DateTimeInterface $now): string
    {
        if ($this->isWebArchiveUrl($url)) {
            // Log through the injected logger: this class has no notice() method of its own.
            $this->log->notice(
                'Skip {lien brisé} on web archive url',
                ['stats' => 'externref.skip.lienBriseOnwebarchiveurl']
            );

            return $url;
        }

        return sprintf(
            '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
            $this->stripWebArchivePrefix($url),
            $this->generateTitleFromURLText($url),
            $now->format('d-m-Y')
        );
    }
150
151
    /**
     * Remove a leading web.archive.org, archive.is or archive.wikiwix.com
     * "<host>/<digits>/" prefix from the URL, leaving the archived address.
     *
     * Bug https://w.wiki/7kUm
     */
    private function stripWebArchivePrefix(string $url): string
    {
        // preg_replace() applies an array of patterns one after the other, in order.
        $archivePrefixPatterns = [
            '#^https?://web\.archive\.org/web/\d+/#',
            '#^https?://archive\.is/\d+/#',
            '#^https?://archive\.wikiwix\.com/cache/\d+/#',
        ];

        return preg_replace($archivePrefixPatterns, '', $url);
    }
161
162
    /**
     * Tell whether the URL starts with one of the known web archive prefixes.
     *
     * todo move
     */
    protected function isWebArchiveUrl(string $url): bool
    {
        $archivePrefixes = [
            'http://web.archive.org/web/',
            'https://web.archive.org/web/',
            'https://archive.is/',
            'https://archive.wikiwix.com/cache/',
        ];

        foreach ($archivePrefixes as $prefix) {
            if (str_starts_with($url, $prefix)) {
                return true;
            }
        }

        return false;
    }
172
}