Passed
Push — main ( f3415e...33c9b9 )
by MusikAnimal
04:41
created

PageRepository::countWikidataItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
declare(strict_types = 1);
7
8
namespace App\Repository;
9
10
use App\Model\Page;
11
use App\Model\Project;
12
use App\Model\User;
13
use DateTime;
14
use Doctrine\DBAL\Driver\ResultStatement;
15
use GuzzleHttp;
16
17
/**
18
 * A PageRepository fetches data about Pages, either singularly or for multiple.
19
 * Despite the name, this does not have a direct correlation with the Pages tool.
20
 * @codeCoverageIgnore
21
 */
22
class PageRepository extends Repository
23
{
24
    /**
25
     * Get metadata about a single page from the API.
26
     * @param Project $project The project to which the page belongs.
27
     * @param string $pageTitle Page title.
28
     * @return string[]|null Array with some of the following keys: pageid, title, missing, displaytitle, url.
29
     *   Returns null if page does not exist.
30
     */
31
    public function getPageInfo(Project $project, string $pageTitle): ?array
32
    {
33
        $info = $this->getPagesInfo($project, [$pageTitle]);
34
        return null !== $info ? array_shift($info) : null;
35
    }
36
37
    /**
38
     * Get metadata about a set of pages from the API.
39
     * @param Project $project The project to which the pages belong.
40
     * @param string[] $pageTitles Array of page titles.
41
     * @return string[]|null Array keyed by the page names, each element with some of the following keys: pageid,
42
     *   title, missing, displaytitle, url. Returns null if page does not exist.
43
     */
44
    public function getPagesInfo(Project $project, array $pageTitles): ?array
45
    {
46
        $params = [
47
            'prop' => 'info|pageprops',
48
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|displaytitle',
49
            'converttitles' => '',
50
            'titles' => join('|', $pageTitles),
51
            'formatversion' => 2,
52
        ];
53
54
        $res = $this->executeApiRequest($project, $params);
55
        $result = [];
56
        if (isset($res['query']['pages'])) {
57
            foreach ($res['query']['pages'] as $pageInfo) {
58
                $result[$pageInfo['title']] = $pageInfo;
59
            }
60
        } else {
61
            return null;
62
        }
63
        return $result;
64
    }
65
66
    /**
67
     * Get the full page text of a set of pages.
68
     * @param Project $project The project to which the pages belong.
69
     * @param string[] $pageTitles Array of page titles.
70
     * @return string[] Array keyed by the page names, with the page text as the values.
71
     */
72
    public function getPagesWikitext(Project $project, array $pageTitles): array
73
    {
74
        $params = [
75
            'prop' => 'revisions',
76
            'rvprop' => 'content',
77
            'titles' => join('|', $pageTitles),
78
            'formatversion' => 2,
79
        ];
80
        $res = $this->executeApiRequest($project, $params);
81
        $result = [];
82
83
        if (!isset($res['query']['pages'])) {
84
            return [];
85
        }
86
87
        foreach ($res['query']['pages'] as $page) {
88
            if (isset($page['revisions'][0]['content'])) {
89
                $result[$page['title']] = $page['revisions'][0]['content'];
90
            } else {
91
                $result[$page['title']] = '';
92
            }
93
        }
94
95
        return $result;
96
    }
97
98
    /**
99
     * Get revisions of a single page.
100
     * @param Page $page The page.
101
     * @param User|null $user Specify to get only revisions by the given user.
102
     * @param false|int $start
103
     * @param false|int $end
104
     * @return string[] Each member with keys: id, timestamp, length.
105
     */
106
    public function getRevisions(Page $page, ?User $user = null, $start = false, $end = false): array
107
    {
108
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');
109
        if ($this->cache->hasItem($cacheKey)) {
110
            return $this->cache->getItem($cacheKey)->get();
111
        }
112
113
        $stmt = $this->getRevisionsStmt($page, $user, null, null, $start, $end);
114
        $result = $stmt->fetchAllAssociative();
115
116
        // Cache and return.
117
        return $this->setCache($cacheKey, $result);
118
    }
119
120
    /**
121
     * Get the statement for a single revision, so that you can iterate row by row.
122
     * @param Page $page The page.
123
     * @param User|null $user Specify to get only revisions by the given user.
124
     * @param ?int $limit Max number of revisions to process.
125
     * @param ?int $numRevisions Number of revisions, if known. This is used solely to determine the
126
     *   OFFSET if we are given a $limit (see below). If $limit is set and $numRevisions is not set,
127
     *   a separate query is ran to get the number of revisions.
128
     * @param false|int $start
129
     * @param false|int $end
130
     * @return ResultStatement
131
     */
132
    public function getRevisionsStmt(
133
        Page $page,
134
        ?User $user = null,
135
        ?int $limit = null,
136
        ?int $numRevisions = null,
137
        $start = false,
138
        $end = false
139
    ): ResultStatement {
140
        $revTable = $this->getTableName(
141
            $page->getProject()->getDatabaseName(),
142
            'revision',
143
            $user ? null : '' // Use 'revision' if there's no user, otherwise default to revision_userindex
144
        );
145
        $commentTable = $page->getProject()->getTableName('comment');
146
        $actorTable = $page->getProject()->getTableName('actor');
147
        $userClause = $user ? "revs.rev_actor = :actorId AND " : "";
148
149
        $limitClause = '';
150
        if (intval($limit) > 0 && isset($numRevisions)) {
151
            $limitClause = "LIMIT $limit";
152
        }
153
154
        $dateConditions = $this->getDateConditions($start, $end, false, 'revs.');
155
156
        $sql = "SELECT * FROM (
157
                    SELECT
158
                        revs.rev_id AS id,
159
                        revs.rev_timestamp AS timestamp,
160
                        revs.rev_minor_edit AS minor,
161
                        revs.rev_len AS length,
162
                        (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
163
                        actor_user AS user_id,
164
                        actor_name AS username,
165
                        comment_text AS `comment`,
166
                        revs.rev_sha1 AS sha
167
                    FROM $revTable AS revs
168
                    LEFT JOIN $actorTable ON revs.rev_actor = actor_id
169
                    LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
170
                    LEFT OUTER JOIN $commentTable ON comment_id = revs.rev_comment_id
171
                    WHERE $userClause revs.rev_page = :pageid $dateConditions
172
                    ORDER BY revs.rev_timestamp DESC
173
                    $limitClause
174
                ) a
175
                ORDER BY timestamp ASC";
176
177
        $params = ['pageid' => $page->getId()];
178
        if ($user) {
179
            $params['actorId'] = $user->getActorId($page->getProject());
180
        }
181
182
        return $this->executeProjectsQuery($page->getProject(), $sql, $params);
183
    }
184
185
    /**
186
     * Get a count of the number of revisions of a single page
187
     * @param Page $page The page.
188
     * @param User|null $user Specify to only count revisions by the given user.
189
     * @param false|int $start
190
     * @param false|int $end
191
     * @return int
192
     */
193
    public function getNumRevisions(Page $page, ?User $user = null, $start = false, $end = false): int
194
    {
195
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
196
        if ($this->cache->hasItem($cacheKey)) {
197
            return $this->cache->getItem($cacheKey)->get();
198
        }
199
200
        // In this case revision is faster than revision_userindex if we're not querying by user.
201
        $revTable = $page->getProject()->getTableName(
202
            'revision',
203
            $user && $this->isLabs() ? '_userindex' : ''
204
        );
205
        $userClause = $user ? "rev_actor = :actorId AND " : "";
206
207
        $dateConditions = $this->getDateConditions($start, $end);
208
209
        $sql = "SELECT COUNT(*)
210
                FROM $revTable
211
                WHERE $userClause rev_page = :pageid $dateConditions";
212
        $params = ['pageid' => $page->getId()];
213
        if ($user) {
214
            $params['rev_actor'] = $user->getActorId($page->getProject());
215
        }
216
217
        $result = (int)$this->executeProjectsQuery($page->getProject(), $sql, $params)->fetchOne();
218
219
        // Cache and return.
220
        return $this->setCache($cacheKey, $result);
221
    }
222
223
    /**
224
     * Get any CheckWiki errors of a single page
225
     * @param Page $page
226
     * @return array Results from query
227
     */
228
    public function getCheckWikiErrors(Page $page): array
229
    {
230
        // Only support mainspace on Labs installations
231
        if (0 !== $page->getNamespace() || !$this->isLabs()) {
232
            return [];
233
        }
234
235
        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
236
                FROM s51080__checkwiki_p.cw_error a
237
                JOIN s51080__checkwiki_p.cw_overview_errors b
238
                WHERE a.project = b.project
239
                AND a.project = :dbName
240
                AND a.title = :title
241
                AND a.error = b.id
242
                AND a.ok = 0";
243
244
        // remove _p if present
245
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());
246
247
        // Page title without underscores (str_replace just to be sure)
248
        $pageTitle = str_replace('_', ' ', $page->getTitle());
249
250
        $conn = $this->getToolsConnection();
251
        return $conn->executeQuery($sql, [
252
            'dbName' => $dbName,
253
            'title' => $pageTitle,
254
        ])->fetchAllAssociative();
255
    }
256
257
    /**
258
     * Get basic wikidata on the page: label and description.
259
     * @param Page $page
260
     * @return string[] In the format:
261
     *    [[
262
     *         'term' => string such as 'label',
263
     *         'term_text' => string (value for 'label'),
264
     *     ], ... ]
265
     */
266
    public function getWikidataInfo(Page $page): array
267
    {
268
        if (empty($page->getWikidataId())) {
269
            return [];
270
        }
271
272
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
273
        $lang = $page->getProject()->getLang();
274
        $wdp = 'wikidatawiki_p';
275
276
        $sql = "SELECT wby_name AS term, wbx_text AS term_text
277
                FROM $wdp.wbt_item_terms
278
                JOIN $wdp.wbt_term_in_lang ON wbit_term_in_lang_id = wbtl_id
279
                JOIN $wdp.wbt_type ON wbtl_type_id = wby_id
280
                JOIN $wdp.wbt_text_in_lang ON wbtl_text_in_lang_id = wbxl_id
281
                JOIN $wdp.wbt_text ON wbxl_text_id = wbx_id
282
                WHERE wbit_item_id = :wikidataId
283
                AND wby_name IN ('label', 'description')
284
                AND wbxl_language = :lang";
285
286
        return $this->executeProjectsQuery('wikidatawiki', $sql, [
287
            'lang' => $lang,
288
            'wikidataId' => $wikidataId,
289
        ])->fetchAllAssociative();
290
    }
291
292
    /**
293
     * Get or count all wikidata items for the given page,
294
     *     not just languages of sister projects
295
     * @param Page $page
296
     * @param bool $count Set to true to get only a COUNT
297
     * @return string[]|int Records as returend by the DB,
298
     *                      or raw COUNT of the records.
299
     */
300
    public function getWikidataItems(Page $page, bool $count = false)
301
    {
302
        if (!$page->getWikidataId()) {
303
            return $count ? 0 : [];
304
        }
305
306
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
307
308
        $sql = "SELECT " . ($count ? 'COUNT(*) AS count' : '*') . "
309
                FROM wikidatawiki_p.wb_items_per_site
310
                WHERE ips_item_id = :wikidataId";
311
312
        $result = $this->executeProjectsQuery('wikidatawiki', $sql, [
313
            'wikidataId' => $wikidataId,
314
        ])->fetchAllAssociative();
315
316
        return $count ? (int) $result[0]['count'] : $result;
317
    }
318
319
    /**
320
     * Get number of in and outgoing links and redirects to the given page.
321
     * @param Page $page
322
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
323
     *                  'links_in_count' and 'redirects_count'
324
     */
325
    public function countLinksAndRedirects(Page $page): array
326
    {
327
        $externalLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'externallinks');
328
        $pageLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'pagelinks');
329
        $redirectTable = $this->getTableName($page->getProject()->getDatabaseName(), 'redirect');
330
331
        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
332
                FROM $externalLinksTable WHERE el_from = :id
333
                UNION
334
                SELECT COUNT(*) AS value, 'links_out' AS type
335
                FROM $pageLinksTable WHERE pl_from = :id
336
                UNION
337
                SELECT COUNT(*) AS value, 'links_in' AS type
338
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
339
                UNION
340
                SELECT COUNT(*) AS value, 'redirects' AS type
341
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";
342
343
        $params = [
344
            'id' => $page->getId(),
345
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
346
            'namespace' => $page->getNamespace(),
347
        ];
348
349
        $res = $this->executeProjectsQuery($page->getProject(), $sql, $params);
350
        $data = [];
351
352
        // Transform to associative array by 'type'
353
        foreach ($res as $row) {
354
            $data[$row['type'] . '_count'] = (int)$row['value'];
355
        }
356
357
        return $data;
358
    }
359
360
    /**
361
     * Count wikidata items for the given page, not just languages of sister projects
362
     * @param Page $page
363
     * @return int Number of records.
364
     */
365
    public function countWikidataItems(Page $page): int
366
    {
367
        return $this->getWikidataItems($page, true);
368
    }
369
370
    /**
371
     * Get page views for the given page and timeframe.
372
     * @fixme use Symfony Guzzle package.
373
     * @param Page $page
374
     * @param string|DateTime $start In the format YYYYMMDD
375
     * @param string|DateTime $end In the format YYYYMMDD
376
     * @return string[]
377
     */
378
    public function getPageviews(Page $page, $start, $end): array
379
    {
380
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
381
382
        /** @var GuzzleHttp\Client $client */
383
        $client = $this->container->get('eight_points_guzzle.client.xtools');
384
385
        if ($start instanceof DateTime) {
386
            $start = $start->format('Ymd');
387
        } else {
388
            $start = (new DateTime($start))->format('Ymd');
389
        }
390
        if ($end instanceof DateTime) {
391
            $end = $end->format('Ymd');
392
        } else {
393
            $end = (new DateTime($end))->format('Ymd');
394
        }
395
396
        $project = $page->getProject()->getDomain();
397
398
        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
399
            "$project/all-access/user/$title/daily/$start/$end";
400
401
        $res = $client->request('GET', $url);
402
        return json_decode($res->getBody()->getContents(), true);
403
    }
404
405
    /**
406
     * Get the full HTML content of the the page.
407
     * @param Page $page
408
     * @param int $revId What revision to query for.
409
     * @return string
410
     */
411
    public function getHTMLContent(Page $page, ?int $revId = null): string
412
    {
413
        /** @var GuzzleHttp\Client $client */
414
        $client = $this->container->get('eight_points_guzzle.client.xtools');
415
        $url = $page->getUrl();
416
        if (null !== $revId) {
417
            $url .= "?oldid=$revId";
418
        }
419
        return $client->request('GET', $url)
420
            ->getBody()
421
            ->getContents();
422
    }
423
424
    /**
425
     * Get the ID of the revision of a page at the time of the given DateTime.
426
     * @param Page $page
427
     * @param DateTime $date
428
     * @return int
429
     */
430
    public function getRevisionIdAtDate(Page $page, DateTime $date): int
431
    {
432
        $revisionTable = $page->getProject()->getTableName('revision');
433
        $pageId = $page->getId();
434
        $datestamp = $date->format('YmdHis');
435
        $sql = "SELECT MAX(rev_id)
436
                FROM $revisionTable
437
                WHERE rev_timestamp <= $datestamp
438
                AND rev_page = $pageId LIMIT 1;";
439
        $resultQuery = $this->getProjectsConnection($page->getProject())
440
            ->executeQuery($sql);
441
        return (int)$resultQuery->fetchOne();
442
    }
443
444
    /**
445
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
446
     * This will send t/50 API requests where t is the number of titles supplied.
447
     * @param Project $project The project.
448
     * @param string[] $pageTitles The titles to fetch.
449
     * @return string[] Keys are the original supplied title, and values are the display titles.
450
     * @static
451
     */
452
    public function displayTitles(Project $project, array $pageTitles): array
453
    {
454
        $client = $this->container->get('eight_points_guzzle.client.xtools');
455
456
        $displayTitles = [];
457
        $numPages = count($pageTitles);
458
459
        for ($n = 0; $n < $numPages; $n += 50) {
460
            $titleSlice = array_slice($pageTitles, $n, 50);
461
            $res = $client->request('GET', $project->getApiUrl(), ['query' => [
462
                'action' => 'query',
463
                'prop' => 'info|pageprops',
464
                'inprop' => 'displaytitle',
465
                'titles' => join('|', $titleSlice),
466
                'format' => 'json',
467
            ]]);
468
            $result = json_decode($res->getBody()->getContents(), true);
469
470
            // Extract normalization info.
471
            $normalized = [];
472
            if (isset($result['query']['normalized'])) {
473
                array_map(
474
                    function ($e) use (&$normalized): void {
475
                        $normalized[$e['to']] = $e['from'];
476
                    },
477
                    $result['query']['normalized']
478
                );
479
            }
480
481
            // Match up the normalized titles with the display titles and the original titles.
482
            foreach ($result['query']['pages'] as $pageInfo) {
483
                $displayTitle = $pageInfo['pageprops']['displaytitle'] ?? $pageInfo['title'];
484
                $origTitle = $normalized[$pageInfo['title']] ?? $pageInfo['title'];
485
                $displayTitles[$origTitle] = $displayTitle;
486
            }
487
        }
488
489
        return $displayTitles;
490
    }
491
}
492