1 | <?php |
||||
2 | /* |
||||
3 | * This file is part of dispositif/wikibot application (@github) |
||||
4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||||
5 | * For the full copyright and MIT license information, view the license file. |
||||
6 | */ |
||||
7 | |||||
8 | declare(strict_types=1); |
||||
9 | |||||
10 | |||||
11 | namespace App\Infrastructure; |
||||
12 | |||||
13 | use App\Application\InfrastructurePorts\HttpClientInterface; |
||||
14 | use App\Application\InfrastructurePorts\PageListForAppInterface; |
||||
15 | use App\Domain\Exceptions\ConfigException; |
||||
16 | use App\Domain\InfrastructurePorts\PageListInterface; |
||||
17 | use Exception; |
||||
18 | use GuzzleHttp\Psr7\Response; |
||||
19 | use InvalidArgumentException; |
||||
20 | use Throwable; |
||||
21 | |||||
/**
 * Page-title provider backed by the MediaWiki search API (action=query, list=search).
 *
 * TODO inject an API session, otherwise limited to 500 results!
 *
 * https://www.mediawiki.org/wiki/Help:CirrusSearch
 * https://fr.wikipedia.org/w/api.php?action=help&modules=query%2Bsearch
 * https://www.mediawiki.org/wiki/Help:CirrusSearch#Insource
 * raw https://fr.wikipedia.org/w/api.php?action=query&list=search&srsearch=%22https://books.google%22%20insource:/\%3Cref\%3Ehttps\:\/\/books\.google/&formatversion=2&format=json
 *
 * Dirty.
 * Class CirrusSearch
 */
class CirrusSearch implements PageListInterface, PageListForAppInterface
{
    public const OPTION_CONTINUE = 'continue';
    public const OPTION_REVERSE = 'reverse';
    public const OPTION_APILOGIN = 'apilogin';
    public const SRSORT_NONE = 'none';
    public const SRSORT_RANDOM = 'random';
    public const SRSORT_LAST_EDIT_DESC = 'last_edit_desc';
    public const SRQIPROFILE_POPULAR_INCLINKS_PV = 'popular_inclinks_pv'; // page view count :)
    public const SRQIPROFILE_DEFAULT = 'engine_autoselect';

    protected const BASE_URL = 'https://fr.wikipedia.org/w/api.php'; // todo move config
    protected const CONTINUE_OFFSET_FILENAME = __DIR__ . '/../../resources/cirrusSearch-{HASH}.txt'; // todo move config

    /** Parameters of the last built request (defaults merged with the constructor params). */
    protected array $requestParams = [];

    /** Default query parameters; any key may be overridden by the constructor params. */
    protected array $defaultParams
        = [
            'action' => 'query',
            'list' => 'search',
            'formatversion' => '2',
            'format' => 'json',
            'srnamespace' => 0,
            'srlimit' => '500', // max 500 for regular users, 5000 for bots/admins
            'srprop' => 'size|wordcount|timestamp', // default 'size|wordcount|timestamp|snippet'
        ];

    protected readonly HttpClientInterface $client;

    /**
     * @param array      $params  search API parameters; "srsearch" is required when the request is built
     * @param array|null $options "continue" => true to resume the search from the offset saved on disk,
     *                            "reverse" => true to return the titles in reverse order
     */
    public function __construct(protected readonly array $params, protected ?array $options = [])
    {
        $this->client = ServiceFactory::getHttpClient();
    }

    /**
     * Runs the search and returns the titles found.
     *
     * When the "continue" option is enabled, the continuation offset returned by the API
     * is persisted (or cleared when the API returns none) before the titles are extracted.
     *
     * @return string[] zero-indexed list of trimmed page titles
     * @throws ConfigException
     * @throws Exception on HTTP error, invalid JSON or offset-file I/O failure
     */
    public function getPageTitles(): array
    {
        $arrayResp = $this->httpRequest();

        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            // A missing/empty "sroffset" yields 0, which clears the saved offset.
            $this->saveOffsetInFile((int) ($arrayResp['continue']['sroffset'] ?? 0));
        }

        $results = $arrayResp['query']['search'] ?? [];
        if (!is_array($results) || $results === []) {
            return [];
        }

        $titles = [];
        foreach ($results as $res) {
            // Compare against '' instead of using empty(): empty('0') is true and
            // would silently drop a page whose title is "0".
            $title = trim((string) ($res['title'] ?? ''));
            if ($title !== '') {
                $titles[] = $title;
            }
        }

        if (($this->options[self::OPTION_REVERSE] ?? false) === true) {
            // array_reverse() (not krsort()) so the result stays a zero-indexed list.
            $titles = array_reverse($titles);
        }

        return $titles;
    }

    /**
     * Sends the search request and returns the decoded JSON response.
     *
     * todo Wiki API ?
     *
     * @return array decoded response, or an empty array when the body is empty
     * @throws ConfigException
     * @throws Exception on non-200 status, undecodable JSON or a JSON payload that is not an object/array
     */
    protected function httpRequest(): array
    {
        $url = $this->getURL();
        if ($url === '' || $url === '0') {
            throw new ConfigException('CirrusSearch null URL');
        }

        // improve with curl options ?
        /** @var Response $response */
        $response = $this->client->get($url); // TODO refac with wiki API login
        if ($response->getStatusCode() !== 200) {
            throw new Exception(
                'CirrusSearch error : ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase()
            );
        }

        $json = (string) $response->getBody()->getContents();
        if ($json === '' || $json === '0') {
            return [];
        }

        try {
            $decoded = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new Exception($e->getMessage(), $e->getCode(), $e);
        }

        // Valid JSON can still be a scalar or null; returning it would violate the array return type.
        if (!is_array($decoded)) {
            throw new Exception('CirrusSearch error : unexpected JSON response (not an object/array)');
        }

        return $decoded;
    }

    /**
     * Builds the request URL and stores the merged parameters in $this->requestParams.
     *
     * With the "continue" option, "sroffset" is read from the offset file and echoed to stdout.
     *
     * @throws InvalidArgumentException when "srsearch" is missing or empty
     * @throws Exception when the offset file exists but cannot be read
     */
    protected function getURL(): string
    {
        if (empty($this->params['srsearch'])) {
            throw new InvalidArgumentException('No "srsearch" argument in params.');
        }

        $this->requestParams = array_merge($this->defaultParams, $this->params);
        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            $this->requestParams['sroffset'] = $this->getOffsetFromFile($this->requestParams);
            printf("Extract offset %s from file \n", $this->requestParams['sroffset']);
        }

        // RFC3986 : space => %20
        $query = http_build_query($this->requestParams, 'bla', '&', PHP_QUERY_RFC3986);

        return self::BASE_URL . '?' . $query;
    }

    /**
     * Reads the saved continuation offset for the given search parameters.
     *
     * @return int the saved offset, or 0 when no offset file exists
     * @throws Exception when the offset file exists but cannot be read
     */
    protected function getOffsetFromFile(array $allParams): int
    {
        $hash = $this->hashSearchParams($allParams);
        $file = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);
        if (!file_exists($file)) {
            return 0;
        }

        $content = file_get_contents($file);
        if ($content === false) {
            // Previously the false value reached trim() and raised a TypeError under strict_types.
            throw new Exception(sprintf('CirrusSearch error : unable to read offset file "%s"', $file));
        }

        return (int) trim($content);
    }

    /**
     * Computes the hash used in the offset file name for a set of search parameters.
     *
     * "sroffset" is excluded so the same search maps to the same file whatever its offset.
     * The hash is built from the parameter values only, in array order; it is kept as is
     * because changing it would no longer match offset files already written.
     *
     * @throws InvalidArgumentException when $params is empty
     */
    protected function hashSearchParams(array $params): string
    {
        if (empty($params)) {
            throw new InvalidArgumentException('No search argument in params.');
        }
        unset($params['sroffset']);

        return md5(implode('', $params));
    }

    /**
     * Persists the continuation offset of the last built request.
     *
     * An offset of 0 means there is nothing to resume: the offset file is removed
     * if present and no file is written.
     *
     * @throws Exception when the offset file cannot be removed or written
     */
    protected function saveOffsetInFile(int $continueOffset = 0): void
    {
        $hash = $this->hashSearchParams($this->requestParams);
        $offsetFilename = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);

        if ($continueOffset === 0) {
            // A stale file left behind would make the next "continue" run skip results.
            if (file_exists($offsetFilename) && !unlink($offsetFilename)) {
                throw new Exception(
                    sprintf('CirrusSearch error : unable to delete offset file "%s"', $offsetFilename)
                );
            }

            return;
        }

        if (file_put_contents($offsetFilename, (string) $continueOffset) === false) {
            throw new Exception(
                sprintf('CirrusSearch error : unable to write offset file "%s"', $offsetFilename)
            );
        }
    }
}
||||
188 |