Dispositif /
Wikibot
| 1 | <?php |
||||
| 2 | /* |
||||
| 3 | * This file is part of dispositif/wikibot application (@github) |
||||
| 4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||||
| 5 | * For the full copyright and MIT license information, view the license file. |
||||
| 6 | */ |
||||
| 7 | |||||
| 8 | declare(strict_types=1); |
||||
| 9 | |||||
| 10 | |||||
| 11 | namespace App\Infrastructure; |
||||
| 12 | |||||
| 13 | use App\Application\InfrastructurePorts\HttpClientInterface; |
||||
| 14 | use App\Application\InfrastructurePorts\PageListForAppInterface; |
||||
| 15 | use App\Domain\Exceptions\ConfigException; |
||||
| 16 | use App\Domain\InfrastructurePorts\PageListInterface; |
||||
| 17 | use Exception; |
||||
| 18 | use GuzzleHttp\Psr7\Response; |
||||
| 19 | use InvalidArgumentException; |
||||
| 20 | use Throwable; |
||||
| 21 | |||||
/**
 * Page list provider built on the MediaWiki search API ("list=search", CirrusSearch)
 * of fr.wikipedia.org.
 *
 * TODO inject an API session, otherwise the search is limited to 500 results!
 * https://www.mediawiki.org/wiki/Help:CirrusSearch
 * https://fr.wikipedia.org/w/api.php?action=help&modules=query%2Bsearch
 * https://www.mediawiki.org/wiki/Help:CirrusSearch#Insource
 * raw https://fr.wikipedia.org/w/api.php?action=query&list=search&srsearch=%22https://books.google%22%20insource:/\%3Cref\%3Ehttps\:\/\/books\.google/&formatversion=2&format=json
 * Dirty.
 * Class CirrusSearch
 */
class CirrusSearch implements PageListInterface, PageListForAppInterface
{
    public const OPTION_CONTINUE = 'continue';
    public const OPTION_REVERSE = 'reverse';
    public const OPTION_APILOGIN = 'apilogin';
    public const SRSORT_NONE = 'none';
    public const SRSORT_RANDOM = 'random';
    public const SRSORT_LAST_EDIT_DESC = 'last_edit_desc';
    public const SRQIPROFILE_POPULAR_INCLINKS_PV = 'popular_inclinks_pv'; // page view count :)
    public const SRQIPROFILE_DEFAULT = 'engine_autoselect';

    protected const BASE_URL = 'https://fr.wikipedia.org/w/api.php'; // todo move config
    protected const CONTINUE_OFFSET_FILENAME = __DIR__ . '/../../resources/cirrusSearch-{HASH}.txt'; // todo move config

    /** Parameters of the last built request: $defaultParams overridden by $params (see getURL()). */
    protected array $requestParams = [];
    protected array $defaultParams
        = [
            'action' => 'query',
            'list' => 'search',
            'formatversion' => '2',
            'format' => 'json',
            'srnamespace' => 0,
            'srlimit' => '500', // max 500 for regular users, 5000 for bots/admins
            'srprop' => 'size|wordcount|timestamp', // default 'size|wordcount|timestamp|snippet'
        ];
    protected readonly HttpClientInterface $client;

    /**
     * @param array      $params  search API parameters merged over the defaults;
     *                            "srsearch" is mandatory (checked in getURL())
     * @param array|null $options behaviour flags keyed by the OPTION_* constants,
     *                            e.g. [self::OPTION_CONTINUE => true] to continue a previous search
     */
    public function __construct(protected readonly array $params, protected ?array $options = [])
    {
        $this->client = ServiceFactory::getHttpClient();
    }

    /**
     * Runs the search and returns the trimmed titles of the matching pages.
     *
     * With OPTION_CONTINUE, the "continue.sroffset" value of the response is persisted
     * (or the stored offset is removed when the response has none).
     * With OPTION_REVERSE strictly true, the titles are iterated in reverse order.
     *
     * @return array
     * @throws ConfigException
     * @throws Exception on HTTP, JSON or offset-file failure
     */
    public function getPageTitles(): array
    {
        $arrayResp = $this->httpRequest();

        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            // A missing or empty "sroffset" is stored as 0, which removes the offset file.
            $this->saveOffsetInFile((int) ($arrayResp['continue']['sroffset'] ?? 0));
        }

        $results = $arrayResp['query']['search'] ?? [];
        if (!is_array($results) || $results === []) {
            return [];
        }

        $titles = [];
        foreach ($results as $res) {
            if (!empty($res['title'])) {
                $titles[] = trim((string) $res['title']); // is trim really needed?
            }
        }

        if (($this->options[self::OPTION_REVERSE] ?? false) === true) {
            // krsort() keeps the numeric keys: only the iteration order is reversed.
            krsort($titles);
        }

        return $titles;
    }

    /**
     * Sends the GET request to the search API and returns the decoded JSON response.
     * An empty body yields an empty array.
     *
     * todo Wiki API ?
     * @throws ConfigException
     * @throws Exception on non-200 status, invalid JSON or a JSON document that is not an array/object
     */
    protected function httpRequest(): array
    {
        $url = $this->getURL();
        // Defensive: getURL() always prefixes BASE_URL, so this only triggers if a subclass overrides it.
        if ($url === '' || $url === '0') {
            throw new ConfigException('CirrusSearch null URL');
        }

        // improve with curl options ?
        /** @var Response $response */
        $response = $this->client->get($url); // TODO refac with wiki API login
        if ($response->getStatusCode() !== 200) {
            throw new Exception(
                'CirrusSearch error : ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase()
            );
        }

        $json = $response->getBody()->getContents();
        if (empty($json)) {
            return [];
        }

        try {
            $array = json_decode((string) $json, true, 512, JSON_THROW_ON_ERROR);
        } catch (Throwable $e) {
            throw new Exception($e->getMessage(), $e->getCode(), $e);
        }

        // Valid JSON can still be a scalar or null; fail with a clear message
        // instead of a TypeError on the array return type.
        if (!is_array($array)) {
            throw new Exception('CirrusSearch error : unexpected JSON response');
        }

        return $array;
    }

    /**
     * Builds the request URL and stores the merged parameters in $requestParams.
     * With OPTION_CONTINUE, "sroffset" is read from the offset file and echoed.
     *
     * @throws InvalidArgumentException when "srsearch" is missing or empty
     * @throws Exception when the offset file exists but cannot be read
     */
    protected function getURL(): string
    {
        if (empty($this->params['srsearch'])) {
            throw new InvalidArgumentException('No "srsearch" argument in params.');
        }

        $this->requestParams = array_merge($this->defaultParams, $this->params);
        if ($this->options[self::OPTION_CONTINUE] ?? false) {
            $this->requestParams['sroffset'] = $this->getOffsetFromFile($this->requestParams);
            printf("Extract offset %s from file \n", $this->requestParams['sroffset']);
        }
        // RFC3986 : space => %20
        $query = http_build_query($this->requestParams, 'bla', '&', PHP_QUERY_RFC3986);

        return self::BASE_URL . '?' . $query;
    }

    /**
     * Returns the offset stored for this set of search parameters, or 0 when no file exists.
     *
     * @throws Exception when the offset file exists but cannot be read
     */
    protected function getOffsetFromFile(array $allParams): int
    {
        $hash = $this->hashSearchParams($allParams);
        $file = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);
        if (!file_exists($file)) {
            return 0;
        }

        $content = file_get_contents($file);
        if ($content === false) {
            // trim(false) would raise a TypeError under strict_types; report the real cause.
            throw new Exception(sprintf('CirrusSearch: unable to read offset file "%s"', $file));
        }

        return (int) trim($content);
    }

    /**
     * Returns the md5 identifier used in the offset file name.
     *
     * "sroffset" is ignored so that every page of the same search maps to the same file.
     * Only the values are concatenated (no keys, no separator); this is kept as is
     * so that already existing offset files keep matching.
     *
     * @throws InvalidArgumentException when $params is empty
     */
    protected function hashSearchParams(array $params): string
    {
        if (empty($params)) {
            throw new InvalidArgumentException('No search argument in params.');
        }
        unset($params['sroffset']);

        return md5(implode('', $params));
    }

    /**
     * Persists the continue offset of the current search.
     * Offset 0 removes the offset file if it exists; no file is created for offset 0.
     *
     * @throws Exception when the offset file cannot be deleted or written
     */
    protected function saveOffsetInFile(int $continueOffset = 0): void
    {
        $hash = $this->hashSearchParams($this->requestParams);
        $offsetFilename = str_replace('{HASH}', $hash, self::CONTINUE_OFFSET_FILENAME);

        if ($continueOffset === 0) {
            // A stale file would make the next "continue" run resume from a wrong offset.
            if (file_exists($offsetFilename) && @unlink($offsetFilename) === false) {
                throw new Exception(
                    sprintf('CirrusSearch: unable to delete offset file "%s"', $offsetFilename)
                );
            }

            return;
        }

        if (file_put_contents($offsetFilename, (string) $continueOffset) === false) {
            throw new Exception(
                sprintf('CirrusSearch: unable to write offset file "%s"', $offsetFilename)
            );
        }
    }
}
||||
| 188 |