Passed
Push — master ( 860b7e...86fe81 )
by Dispositif
03:31
created

DeadLinkTransformer::stripWebArchivePrefix()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 3
dl 0
loc 6
rs 10
c 1
b 0
f 0
cc 1
nc 1
nop 1
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Domain\InfrastructurePorts\DeadlinkArchiverInterface;
13
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
14
use App\Domain\Models\Summary;
15
use App\Domain\Models\WebarchiveDTO;
16
use App\Domain\Publisher\ExternMapper;
17
use App\Infrastructure\InternetDomainParser;
18
use App\Infrastructure\ServiceFactory;
19
use DateTimeImmutable;
20
use DateTimeInterface;
21
use Psr\Log\LoggerInterface;
22
use Psr\Log\NullLogger;
23
24
/**
25
 * Transform a dead link URL into a {{Lien brisé}} template, or replace it with a citation built from its web archive copy.
26
 */
27
class DeadLinkTransformer
28
{
29
    private const USE_TOR_FOR_ARCHIVE = false;
30
    private const DELAY_PARSE_ARCHIVE = 3;
31
    private const REPLACE_RAW_WIKIWIX_BY_LIENWEB = false;
32
33
    /**
     * All collaborators are optional and stored as promoted properties.
     *
     * @param DeadlinkArchiverInterface[]        $archivers            Archivers to query; one is picked at random per URL.
     * @param InternetDomainParserInterface|null $domainParser         When set, the registrable domain of the original URL
     *                                                                 is passed as an option to the extern-ref transformer.
     * @param ExternRefTransformerInterface|null $externRefTransformer Transformer run on the archive URL; a default one
     *                                                                 is created on first use when null.
     * @param LoggerInterface                    $log                  Receives the notice/debug messages; a NullLogger
     *                                                                 is used when none is given.
     */
    public function __construct(
        protected array $archivers = [],
        protected ?InternetDomainParserInterface $domainParser = null,
        protected ?ExternRefTransformerInterface $externRefTransformer = null,
        protected LoggerInterface $log = new NullLogger(),
    ) {
    }
44
45
    /**
     * Produce the replacement text for a dead URL.
     *
     * One archiver is picked at random among the configured ones and asked for an archive
     * of $url. When an archive is found, the text is generated from that archive; in every
     * other case a {{Lien brisé}} template is returned.
     *
     * @param string            $url the dead URL
     * @param DateTimeInterface $now date written in the "brisé le" field of {{Lien brisé}}
     */
    public function formatFromUrl(string $url, DateTimeInterface $now = new DateTimeImmutable()): string
    {
        // Nothing to query: go straight to the broken-link template (no log message in that case).
        if ($this->archivers === []) {
            return $this->generateLienBrise($url, $now);
        }

        // choose randomly one archiver
        $pickedArchiver = $this->archivers[array_rand($this->archivers)];
        if (!$pickedArchiver instanceof DeadlinkArchiverInterface) {
            return $this->generateLienBrise($url, $now);
        }

        $archive = $pickedArchiver->searchWebarchive($url);
        if (!$archive instanceof WebarchiveDTO) {
            $this->log->notice('web archive not found');

            return $this->generateLienBrise($url, $now);
        }

        // Only the two known archivers get a dedicated "found" notice.
        $foundNotice = match ($archive->getArchiver()) {
            '[[Wikiwix]]' => '🥝 Wikiwix found',
            '[[Internet Archive]]' => '🏛️ InternetArchive found',
            default => null,
        };
        if ($foundNotice !== null) {
            $this->log->notice($foundNotice);
        }
        $this->log->debug('archive url: ' . $archive->getArchiveUrl());

        return $this->generateLienWebFromArchive($archive);
    }
68
69
    /**
     * Generate the text for a found archive.
     *
     * Waits DELAY_PARSE_ARCHIVE seconds, then runs the extern-ref transformer on the archive URL.
     * The result is returned as is, unless REPLACE_RAW_WIKIWIX_BY_LIENWEB is enabled and the
     * result still starts with the raw Wikiwix cache URL: a minimal {{Lien web}} is then built
     * from the DTO instead.
     */
    private function generateLienWebFromArchive(WebarchiveDTO $dto): string
    {
        sleep(self::DELAY_PARSE_ARCHIVE);

        $processed = $this->externRefProcessOnArchive($dto);

        // Wikiwix : "Sorry, this system is overloaded. Please come back in a minute."
        // manage content-type 'application/pdf' which is not parsed by ExternRefTransformer
        $isRawWikiwix = self::REPLACE_RAW_WIKIWIX_BY_LIENWEB
            && str_starts_with($processed, 'https://archive.wikiwix.com/cache/');
        if (!$isRawWikiwix) {
            return $processed;
        }

        $this->log->notice('Replace raw wikiwix by lien web');

        $archiveUrl = $dto->getArchiveUrl();
        $title = 'Archive ' . $this->generateTitleFromURLText($dto->getOriginalUrl()) . '<!-- titre à compléter -->';
        $site = 'via ' . $dto->getArchiver();
        $consultedOn = date('d-m-Y');
        $archiveDate = $dto->getArchiveDate();
        $archivedOn = $archiveDate instanceof DateTimeInterface ? $archiveDate->format('d-m-Y') : '';

        return sprintf(
            '{{Lien web |url= %s |titre=%s |site= %s |consulté le=%s |archive-date=%s}}',
            $archiveUrl,
            $title,
            $site,
            $consultedOn,
            $archivedOn
        );
    }
95
96
    /**
     * To extract the title+author+lang+… from the webarchive page.
     *
     * Runs the extern-ref transformer on the archive URL of the DTO. When no transformer was
     * injected, a default one is created on first use and kept for later calls. When a domain
     * parser is available, the registrable domain of the original URL is passed to the
     * transformer as the 'originalRegistrableDomain' option.
     *
     * @return string the text returned by the transformer for the archive URL
     */
    private function externRefProcessOnArchive(WebarchiveDTO $dto): string
    {
        $summary = new Summary('test');

        // Lazy default, narrowed into a local variable so that process() is never called on the
        // nullable property itself (static analysis reported "process() does not exist on null").
        $transformer = ($this->externRefTransformer ??= new ExternRefTransformer(
            new ExternMapper($this->log),
            ServiceFactory::getHttpClient(self::USE_TOR_FOR_ARCHIVE),
            new InternetDomainParser(),
            $this->log,
        )); // todo inverse dependency

        $options = $this->domainParser instanceof InternetDomainParserInterface
            ? ['originalRegistrableDomain' => $this->domainParser->getRegistrableDomainFromURL($dto->getOriginalUrl())]
            : [];

        return $transformer->process($dto->getArchiveUrl(), $summary, $options);
    }
117
118
    /**
     * Derive a short placeholder title from a URL.
     *
     * Every occurrence of "https://", "http://" and "www." is removed (not only a leading one),
     * then the text is cut to 30 characters followed by "…" when it is longer.
     *
     * The length test and the cut are done in characters, not bytes: a URL containing multibyte
     * UTF-8 characters must not be split in the middle of a character, which would put an
     * invalid byte sequence in the generated title.
     */
    protected function generateTitleFromURLText(string $url): string
    {
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8') . '…';
        }

        return $text;
    }
127
128
    /**
     * Build the {{Lien brisé}} template for a dead URL.
     *
     * The "url" field receives $url without its web-archive prefix, the "titre" field a short
     * text derived from $url as given, and "brisé le" the date $now formatted as d-m-Y.
     */
    protected function generateLienBrise(string $url, DateTimeInterface $now): string
    {
        $originalUrl = $this->stripWebArchivePrefix($url);
        $placeholderTitle = $this->generateTitleFromURLText($url);
        $brokenOn = $now->format('d-m-Y');

        return sprintf('{{Lien brisé |url= %s |titre=%s |brisé le=%s}}', $originalUrl, $placeholderTitle, $brokenOn);
    }
137
138
    /**
     * Bug https://w.wiki/7kUm
     *
     * Remove a leading web-archive wrapper (web.archive.org, archive.is or archive.wikiwix.com
     * followed by a numeric segment) from a URL, leaving what comes after it.
     * A URL without such a prefix is returned unchanged.
     */
    private function stripWebArchivePrefix(string $url): string
    {
        $archivePrefixes = [
            '#^https?://web\.archive\.org/web/\d+/#',
            '#^https?://archive\.is/\d+/#',
            '#^https?://archive\.wikiwix\.com/cache/\d+/#',
        ];

        // The patterns are applied in order, each one on the result of the previous one.
        // preg_replace() returns null on a PCRE error: keep the URL untouched in that case
        // rather than failing the string return type.
        return preg_replace($archivePrefixes, '', $url) ?? $url;
    }
148
}