WstatImport::getPageTitles()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 4
ccs 0
cts 3
cp 0
crap 2
rs 10
c 1
b 0
f 0
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Infrastructure;
11
12
use App\Application\InfrastructurePorts\PageListForAppInterface;
13
use App\Domain\InfrastructurePorts\PageListInterface;
14
use Exception;
15
use GuzzleHttp\Client;
16
17
/**
 * @unused
 * Data import from https://wstat.fr (frwiki daily dump parsing).
 * https://wstat.fr/template/index.php?title=Ouvrage&query=inclusions&param=isbn&start=50000&limit=50&format=json
 * Class WstatImport.
 *
 * Pages through the wstat.fr template endpoint and splits every returned line
 * at its first "|" into a page title and the raw template text.
 */
class WstatImport implements PageListInterface, PageListForAppInterface
{
    final public const MAX_IMPORT = 50000;

    /** Query-string parameters sent to wstat.fr; 'start' is advanced after each page. */
    private array $params = [];

    /** Remaining line budget; decremented for every non-marker line read from a page. */
    private int $max = 100;

    /**
     * @param Client     $client HTTP client used to query wstat.fr
     * @param array|null $params query-string parameters; when null or empty, a default "Ouvrage" title listing is used
     * @param int|null   $max    line budget, capped at self::MAX_IMPORT; null means "use the cap"
     */
    public function __construct(private readonly Client $client, ?array $params = null, ?int $max = 500)
    {
        // Resolve null explicitly instead of letting min() return null and
        // relying on null arithmetic later on.
        $this->max = min(self::MAX_IMPORT, $max ?? self::MAX_IMPORT);

        //example
        // "nom de page" : https://wstat.fr/template/index.php?title=Ouvrage&query=inclusions-title&start=105000&limit=5000
        // "modèle complet" : https://wstat.fr/template/index.php?title=Ouvrage&query=inclusions&start=105000&limit=5000
        if (!$params) {
            $params = [
                'title' => 'Ouvrage',
                'query' => 'inclusions-title',
                //                'param' => 'isbn',
                'start' => 50000,
                'limit' => 5000,
            ];
        }
        $this->params = $params;
    }

    /**
     * Build the request URL for the current page.
     *
     * Forces the 'format' parameter to 'json' (stored in the instance parameters).
     */
    public function getUrl(): string
    {
        $this->params['format'] = 'json';
        // todo verify http_build_query() enc_type parameter
        return 'https://wstat.fr/template/index.php?'.http_build_query($this->params);
    }

    /**
     * Fetch pages one after another until an empty page is received or the
     * line budget is used up, pausing 3 seconds between requests.
     *
     * The budget is only checked once a whole page has been merged, so the
     * result can hold more rows than the budget (up to one extra page).
     * Progress is echoed to standard output after each page.
     *
     * @return array [ ['title' => ..., 'template' => ...] ]
     * @throws Exception
     */
    public function getData(): array
    {
        $data = [];
        while (true) {
            $json = $this->import($this->getUrl());
            $raw = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (empty($raw)) {
                // An empty page ends the listing: stop here but keep the rows
                // gathered from the previous pages.
                break;
            }
            $data = array_merge($data, $this->parsingWstatData($raw));
            echo count($data)." titles\n";
            if ($this->max <= 0) {
                break;
            }

            // next page initialisation
            $this->params['start'] = ((int) $this->params['start'] + $this->params['limit']);
            sleep(3);
        }

        return $data;
    }

    /**
     * Explode raw lines into title/template pairs.
     *
     * Each line looks like "Page title|{{Ouvrage|...}}" and is split at the
     * first "|". Lines without a "|", or starting with one, are dropped but
     * still consume the line budget; the '<!-- + -->' marker line does not.
     *
     * @return array [['title' => ..., 'template' => ...]]
     */
    private function parsingWstatData(array $raw): array
    {
        // Generator ?
        // Alexandre S. Giffard|{{Ouvrage|langue=|auteur1=|prénom...
        $data = [];
        foreach ($raw as $line) {
            // end of page ?
            if ('<!-- + -->' === $line) {
                continue;
            }
            --$this->max;

            // validate and explode wstat data
            $text = (string) $line;
            $pos = mb_strpos($text, '|', 0);
            if (false === $pos || 0 === $pos) {
                continue;
            }
            $data[] = [
                'title' => trim(mb_substr($text, 0, $pos)),
                'template' => trim(mb_substr($text, $pos + 1)),
            ];
        }

        return $data;
    }

    /**
     * Perform the HTTP GET and return the response body.
     *
     * @throws Exception when the response status code is not 200
     */
    private function import(string $url): string
    {
        $response = $this->client->get($url);
        if (200 !== $response->getStatusCode()) {
            throw new Exception(
                sprintf('Error code: %s reason: %s', $response->getStatusCode(), $response->getReasonPhrase())
            );
        }

        return $response->getBody()->getContents();
    }

    /**
     * Not implemented yet: always returns an empty list.
     */
    public function getPageTitles(): array
    {
        // TODO: Implement getPageTitles() method.
        return [];
    }
}
138