Passed
Push — master ( 732591...229bc5 )
by Dispositif
03:49
created

CirrusSearch   A

Complexity

Total Complexity 27

Size/Duplication

Total Lines 153
Duplicated Lines 0 %

Test Coverage

Coverage 0%

Importance

Changes 4
Bugs 0 Features 1
Metric Value
wmc 27
eloc 75
c 4
b 0
f 1
dl 0
loc 153
rs 10
ccs 0
cts 28
cp 0

7 Methods

Rating   Name   Duplication   Size   Complexity  
A getURL() 0 15 3
A __construct() 0 3 1
B getPageTitles() 0 29 9
A httpRequest() 0 28 6
A hashSearchParams() 0 10 3
A saveOffsetInFile() 0 9 3
A getOffsetFromFile() 0 9 2
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Infrastructure;
12
13
use App\Application\InfrastructurePorts\HttpClientInterface;
14
use App\Application\InfrastructurePorts\PageListForAppInterface;
15
use App\Domain\Exceptions\ConfigException;
16
use App\Domain\InfrastructurePorts\PageListInterface;
17
use Exception;
18
use GuzzleHttp\Psr7\Response;
19
use InvalidArgumentException;
20
use Throwable;
21
22
/**
 * Page-title provider backed by the MediaWiki search API (CirrusSearch).
 *
 * Builds an "action=query&list=search" request from the params given to the
 * constructor, sends it through the injected HTTP client and returns the titles
 * of the matching pages. With the "continue" option, the API continuation offset
 * is persisted in a file (one file per distinct search) so that successive runs
 * walk through the result set.
 *
 * https://www.mediawiki.org/wiki/Help:CirrusSearch
 * https://fr.wikipedia.org/w/api.php?action=help&modules=query%2Bsearch
 * https://www.mediawiki.org/wiki/Help:CirrusSearch#Insource
 * raw https://fr.wikipedia.org/w/api.php?action=query&list=search&srsearch=%22https://books.google%22%20insource:/\%3Cref\%3Ehttps\:\/\/books\.google/&formatversion=2&format=json
 * Dirty.
 * Class CirrusSearch
 */
class CirrusSearch implements PageListInterface, PageListForAppInterface
{
    /** Option key: when truthy, resume from (and store) the API continuation offset. */
    public const OPTION_CONTINUE = 'continue';
    /** Option key: when strictly true, return the titles in reverse order. */
    public const OPTION_REVERSE = 'reverse';
    public const SRSORT_NONE = 'none';
    public const SRSORT_RANDOM = 'random';
    public const SRSORT_LAST_EDIT_DESC = 'last_edit_desc';
    public const SRQIPROFILE_POPULAR_INCLINKS_PV = 'popular_inclinks_pv'; // page view count :)
    public const SRQIPROFILE_DEFAULT = 'engine_autoselect';

    protected const BASE_URL = 'https://fr.wikipedia.org/w/api.php'; // todo move config
    protected const CONTINUE_OFFSET_FILENAME = __DIR__ . '/../../resources/cirrusSearch-{HASH}.txt'; // todo move config

    /** Params of the last request built by getURL() (defaults + constructor params + optional sroffset). */
    protected array $requestParams = [];
    /** Base API params; any key can be overridden by the constructor params. */
    protected array $defaultParams
        = [
            'action' => 'query',
            'list' => 'search',
            'formatversion' => '2',
            'format' => 'json',
            'srnamespace' => 0,
            'srlimit' => '10', // default 10, max 500
            'srprop' => 'size|wordcount|timestamp', // default 'size|wordcount|timestamp|snippet'
        ];
    protected readonly HttpClientInterface $client;

    /**
     * @param array      $params  search API params; "srsearch" is required (checked when the URL is built)
     * @param array|null $options "continue" => true to continue a previous search,
     *                            "reverse" => true to reverse the returned titles
     */
    public function __construct(protected readonly array $params, protected ?array $options = [])
    {
        // A readonly property may be initialised once from inside the class scope.
        $this->client = ServiceFactory::getHttpClient();
    }

    /**
     * Run the search and return the titles of the matching pages.
     *
     * With the "continue" option the offset returned by the API is stored before
     * the results are processed; a response without a continuation offset resets
     * the stored offset.
     *
     * @return array list of page titles (0-indexed, reversed when the "reverse" option is true)
     * @throws ConfigException
     * @throws Exception
     */
    public function getPageTitles(): array
    {
        $arrayResp = $this->httpRequest();

        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            $continueOffset = 0;
            if (!empty($arrayResp['continue']['sroffset'])) {
                $continueOffset = (int)$arrayResp['continue']['sroffset'];
            }
            $this->saveOffsetInFile($continueOffset);
        }

        if (!isset($arrayResp['query']) || empty($arrayResp['query']['search'])) {
            return [];
        }

        $titles = [];
        foreach ($arrayResp['query']['search'] as $result) {
            // Explicit empty-string test: empty() would also discard the valid title "0".
            $title = trim((string)($result['title'] ?? ''));
            if ($title !== '') {
                $titles[] = $title;
            }
        }

        if (isset($this->options[self::OPTION_REVERSE]) && $this->options[self::OPTION_REVERSE] === true) {
            // array_reverse() re-indexes the list; krsort() kept the original keys,
            // so index-based access still saw the non-reversed order.
            $titles = array_reverse($titles);
        }

        return $titles;
    }

    /**
     * Send the search request and return the decoded JSON response.
     *
     * todo Wiki API ?
     *
     * @return array decoded response, or an empty array when the body is empty
     * @throws ConfigException when no URL could be built
     * @throws Exception on a non-200 status, undecodable JSON, a non-array payload
     *                   or an error payload returned by the API
     */
    protected function httpRequest(): array
    {
        $url = $this->getURL();
        if ($url === '' || $url === '0') {
            throw new ConfigException('CirrusSearch null URL');
        }

        // improve with curl options ?
        /** @var Response $response */
        $response = $this->client->get($url);
        if ($response->getStatusCode() !== 200) {
            throw new Exception(
                'CirrusSearch error : ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase()
            );
        }

        $json = $response->getBody()->getContents();
        if (empty($json)) {
            return [];
        }

        try {
            $decoded = json_decode((string)$json, true, 512, JSON_THROW_ON_ERROR);
        } catch (Throwable $e) {
            throw new Exception($e->getMessage(), $e->getCode(), $e);
        }

        // Valid JSON that is not an object/array (null, number, string...) would
        // otherwise break the declared array return type with a TypeError.
        if (!is_array($decoded)) {
            throw new Exception('CirrusSearch error : unexpected JSON response');
        }

        // The API can report a failure in the payload while still answering HTTP 200.
        // Treating that as "no result" would reset the stored continue offset.
        if (isset($decoded['error'])) {
            throw new Exception('CirrusSearch API error : ' . (string)json_encode($decoded['error']));
        }

        return $decoded;
    }

    /**
     * Build the request URL and remember the merged params in $this->requestParams.
     *
     * @throws InvalidArgumentException when the "srsearch" param is missing or empty
     */
    protected function getURL(): string
    {
        $search = $this->params['srsearch'] ?? null;
        // empty() alone would also reject the legitimate search term "0".
        if (empty($search) && !is_numeric($search)) {
            throw new InvalidArgumentException('No "srsearch" argument in params.');
        }

        $this->requestParams = array_merge($this->defaultParams, $this->params);
        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            // The hash ignores "sroffset", so the same file is found on every run of a given search.
            $this->requestParams['sroffset'] = $this->getOffsetFromFile($this->requestParams);
        }
        // RFC3986 : space => %20. The 'bla' prefix only applies to params with a numeric key.
        $query = http_build_query($this->requestParams, 'bla', '&', PHP_QUERY_RFC3986);

        return self::BASE_URL . '?' . $query;
    }

    /**
     * Read the stored continue offset for the given search params.
     *
     * @return int stored offset, or 0 when there is no readable offset file
     */
    protected function getOffsetFromFile(array $allParams): int
    {
        $hash = $this->hashSearchParams($allParams);
        $file = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);
        if (!is_file($file) || !is_readable($file)) {
            return 0;
        }

        $content = file_get_contents($file);
        if ($content === false) {
            // Under strict_types, trim(false) would raise a TypeError: restart from 0 instead.
            return 0;
        }

        return max(0, (int)trim($content));
    }

    /**
     * Hash identifying a search, used in the offset filename.
     *
     * "sroffset" is excluded so that every page of the same search shares one hash.
     * Only the values are hashed (keys and separators are ignored); this is kept
     * as is so that already-stored offset files remain valid.
     *
     * @throws InvalidArgumentException when $params is empty
     */
    protected function hashSearchParams(array $params): string
    {
        if ($params === []) {
            throw new InvalidArgumentException('No search argument in params.');
        }
        unset($params['sroffset']);

        return md5(implode('', $params));
    }

    /**
     * Persist the continue offset of the last request built by getURL().
     *
     * An offset of 0 means there is nothing to resume: any stored offset file is
     * removed and no file is written.
     */
    protected function saveOffsetInFile(int $continueOffset = 0): void
    {
        $hash = $this->hashSearchParams($this->requestParams);
        $offsetFilename = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);

        if ($continueOffset === 0) {
            if (file_exists($offsetFilename)) {
                // Best effort: a failed deletion is not fatal and is retried on a later run.
                @unlink($offsetFilename);
            }

            return;
        }

        file_put_contents($offsetFilename, (string)$continueOffset);
    }
}
186