Issues (106)

src/Infrastructure/CirrusSearch.php (2 issues)

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Infrastructure;
12
13
use App\Application\InfrastructurePorts\HttpClientInterface;
14
use App\Application\InfrastructurePorts\PageListForAppInterface;
15
use App\Domain\Exceptions\ConfigException;
16
use App\Domain\InfrastructurePorts\PageListInterface;
17
use Exception;
18
use GuzzleHttp\Psr7\Response;
19
use InvalidArgumentException;
20
use Throwable;
21
22
/**
 * Page title provider backed by the MediaWiki search API (action=query&list=search).
 *
 * TODO: inject an authenticated API session, otherwise the search is limited to 500 results!
 *
 * https://www.mediawiki.org/wiki/Help:CirrusSearch
 * https://fr.wikipedia.org/w/api.php?action=help&modules=query%2Bsearch
 * https://www.mediawiki.org/wiki/Help:CirrusSearch#Insource
 * raw https://fr.wikipedia.org/w/api.php?action=query&list=search&srsearch=%22https://books.google%22%20insource:/\%3Cref\%3Ehttps\:\/\/books\.google/&formatversion=2&format=json
 *
 * Dirty.
 * Class CirrusSearch
 */
class CirrusSearch implements PageListInterface, PageListForAppInterface
{
    public const OPTION_CONTINUE = 'continue';
    public const OPTION_REVERSE = 'reverse';
    public const OPTION_APILOGIN = 'apilogin';
    public const SRSORT_NONE = 'none';
    public const SRSORT_RANDOM = 'random';
    public const SRSORT_LAST_EDIT_DESC = 'last_edit_desc';
    public const SRQIPROFILE_POPULAR_INCLINKS_PV = 'popular_inclinks_pv'; // page view count :)
    public const SRQIPROFILE_DEFAULT = 'engine_autoselect';

    protected const BASE_URL = 'https://fr.wikipedia.org/w/api.php'; // todo move config
    protected const CONTINUE_OFFSET_FILENAME = __DIR__ . '/../../resources/cirrusSearch-{HASH}.txt'; // todo move config

    /** Parameters actually sent with the last built request (defaults + user params + optional offset). */
    protected array $requestParams = [];

    /** Default API parameters; any key present in the constructor $params overrides them. */
    protected array $defaultParams
        = [
            'action' => 'query',
            'list' => 'search',
            'formatversion' => '2',
            'format' => 'json',
            'srnamespace' => 0,
            'srlimit' => '500', // max 500 for regular users, 5000 for bots/admins
            'srprop' => 'size|wordcount|timestamp', // default 'size|wordcount|timestamp|snippet'
        ];

    protected readonly HttpClientInterface $client;

    /**
     * @param array      $params  search API parameters; "srsearch" is required (checked when the URL is built)
     * @param array|null $options behaviour flags keyed by the OPTION_* constants,
     *                            e.g. "continue" => true to resume the search from the saved offset
     */
    public function __construct(protected readonly array $params, protected ?array $options = [])
    {
        // A readonly property may be initialised exactly once from inside the class: this is that place.
        $this->client = ServiceFactory::getHttpClient();
    }

    /**
     * Run the search and return the titles of the matching pages.
     *
     * When the "continue" option is enabled, the offset returned by the API is persisted
     * so that the next run resumes where this one stopped.
     *
     * @return array list of page titles (0-indexed), reversed when the "reverse" option is strictly true
     *
     * @throws ConfigException
     * @throws Exception on HTTP, JSON or offset-file failure
     */
    public function getPageTitles(): array
    {
        $arrayResp = $this->httpRequest();

        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            // No "continue" block in the response yields offset 0, which clears the saved state.
            $continueOffset = 0;
            if (!empty($arrayResp['continue']['sroffset'])) {
                $continueOffset = (int) $arrayResp['continue']['sroffset'];
            }
            $this->saveOffsetInFile($continueOffset);
        }

        $results = $arrayResp['query']['search'] ?? [];
        if (empty($results) || !is_array($results)) {
            return [];
        }

        $titles = [];
        foreach ($results as $res) {
            if (!empty($res['title'])) {
                $titles[] = trim((string) $res['title']); // is trim useful?
            }
        }

        if (($this->options[self::OPTION_REVERSE] ?? false) === true) {
            // array_reverse() instead of krsort(): krsort() keeps the original integer keys,
            // so the result would no longer be a 0-indexed list.
            $titles = array_reverse($titles);
        }

        return $titles;
    }

    /**
     * Send the search request and return the decoded JSON response.
     *
     * todo Wiki API ?
     *
     * @return array decoded response, or an empty array when the body is empty
     *
     * @throws ConfigException
     * @throws Exception when the HTTP status is not 200 or the body is not a JSON object/array
     */
    protected function httpRequest(): array
    {
        $url = $this->getURL();
        if ($url === '' || $url === '0') {
            throw new ConfigException('CirrusSearch null URL');
        }

        // improve with curl options ?
        $response = $this->client->get($url); // TODO refac with wiki API login
        /**
         * @var $response Response
         */
        if ($response->getStatusCode() !== 200) {
            throw new Exception(
                'CirrusSearch error : ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase()
            );
        }

        $json = $response->getBody()->getContents();
        if (empty($json)) {
            return [];
        }

        try {
            $decoded = json_decode((string) $json, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new Exception($e->getMessage(), $e->getCode(), $e);
        }

        // Valid JSON is not necessarily an object/array ("null", "42", ...): fail with the
        // documented exception instead of a TypeError on the return type.
        if (!is_array($decoded)) {
            throw new Exception('CirrusSearch error : unexpected JSON response');
        }

        return $decoded;
    }

    /**
     * Build the request URL from the default parameters, the user parameters and,
     * when the "continue" option is enabled, the offset saved by a previous run.
     *
     * Side effects: stores the merged parameters in $this->requestParams and prints the
     * extracted offset when continuing.
     *
     * @throws InvalidArgumentException when "srsearch" is missing or empty
     */
    protected function getURL(): string
    {
        if (empty($this->params['srsearch'])) {
            throw new InvalidArgumentException('No "srsearch" argument in params.');
        }

        $this->requestParams = array_merge($this->defaultParams, $this->params);
        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            $this->requestParams['sroffset'] = $this->getOffsetFromFile($this->requestParams);
            echo sprintf("Extract offset %s from file \n", $this->requestParams['sroffset']);
        }

        // RFC3986 : space => %20
        $query = http_build_query($this->requestParams, 'bla', '&', PHP_QUERY_RFC3986);

        return self::BASE_URL . '?' . $query;
    }

    /**
     * Read the offset saved for the given search parameters.
     *
     * @return int saved offset, or 0 when no offset file exists
     *
     * @throws InvalidArgumentException when $allParams is empty
     * @throws \RuntimeException when the offset file exists but cannot be read
     */
    protected function getOffsetFromFile(array $allParams): int
    {
        $file = $this->getOffsetFilename($allParams);
        if (!file_exists($file)) {
            return 0;
        }

        $content = file_get_contents($file);
        if ($content === false) {
            // Under strict_types, trim(false) would raise a TypeError: report the real cause instead.
            throw new \RuntimeException('CirrusSearch: unable to read offset file ' . $file);
        }

        // A negative value can only come from a corrupted file; never send it to the API.
        return max(0, (int) trim($content));
    }

    /**
     * Compute the fingerprint identifying a search, used to name its offset file.
     *
     * The "sroffset" value is excluded so that the same search maps to the same file
     * whatever its current offset. Only the parameter values (in order) are hashed;
     * the algorithm is kept unchanged so that previously saved offset files still match.
     *
     * @throws InvalidArgumentException when $params is empty
     */
    protected function hashSearchParams(array $params): string
    {
        if (empty($params)) {
            throw new InvalidArgumentException('No search argument in params.');
        }
        unset($params['sroffset']);

        return md5(implode('', $params));
    }

    /**
     * Persist the offset from which the next run of the current search must continue.
     *
     * An offset of 0 removes the offset file, if any, and writes nothing.
     *
     * @throws InvalidArgumentException when no request has been built yet (empty request params)
     * @throws \RuntimeException when the offset file cannot be deleted or written
     */
    protected function saveOffsetInFile(int $continueOffset = 0): void
    {
        $offsetFilename = $this->getOffsetFilename($this->requestParams);

        if ($continueOffset === 0) {
            // Nothing to resume from: a leftover file would make the next run restart at a stale offset.
            if (file_exists($offsetFilename) && unlink($offsetFilename) === false) {
                throw new \RuntimeException('CirrusSearch: unable to delete offset file ' . $offsetFilename);
            }

            return;
        }

        if (file_put_contents($offsetFilename, (string) $continueOffset) === false) {
            throw new \RuntimeException('CirrusSearch: unable to write offset file ' . $offsetFilename);
        }
    }

    /**
     * Resolve the path of the offset file associated with the given search parameters.
     *
     * @throws InvalidArgumentException when $params is empty
     */
    private function getOffsetFilename(array $params): string
    {
        return str_replace('{HASH}', $this->hashSearchParams($params), self::CONTINUE_OFFSET_FILENAME);
    }
}
188