PageRepository::getWikidataItems()   A
last analyzed

Complexity

Conditions 5
Paths 6

Size

Total Lines 17
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 8
nc 6
nop 2
dl 0
loc 17
rs 9.6111
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types = 1);
4
5
namespace App\Repository;
6
7
use App\Exception\BadGatewayException;
8
use App\Model\Page;
9
use App\Model\Project;
10
use App\Model\User;
11
use DateTime;
12
use Doctrine\DBAL\Driver\ResultStatement;
13
use GuzzleHttp\Exception\ClientException;
14
use GuzzleHttp\Exception\ConnectException;
15
use GuzzleHttp\Exception\ServerException;
16
use GuzzleHttp\RequestOptions;
17
use Symfony\Component\HttpFoundation\Response;
18
19
/**
20
 * A PageRepository fetches data about Pages, either one at a time or for several at once.
21
 * Despite the name, this does not have a direct correlation with the Pages tool.
22
 * @codeCoverageIgnore
23
 */
24
class PageRepository extends Repository
25
{
26
    /**
     * Fetch API metadata for one page.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[]|null The page's entry with some of the following keys: pageid, title, missing,
     *   displaytitle, url. Null if getPagesInfo() returned null or an empty set.
     */
    public function getPageInfo(Project $project, string $pageTitle): ?array
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);
        if (null === $pages) {
            return null;
        }

        // array_shift() gives the first (only) entry, or null when the set is empty.
        return array_shift($pages);
    }
38
39
    /**
     * Fetch API metadata for several pages in one request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return array|null Array keyed by the page titles as returned by the API, each element with some of
     *   the following keys: pageid, title, missing, displaytitle, url.
     *   Null if the API response contains no 'pages' list.
     */
    public function getPagesInfo(Project $project, array $pageTitles): ?array
    {
        $res = $this->executeApiRequest($project, [
            'prop' => 'info|pageprops',
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|displaytitle',
            'converttitles' => '',
            'titles' => join('|', $pageTitles),
            'formatversion' => 2,
        ]);

        if (!isset($res['query']['pages'])) {
            return null;
        }

        // Re-key the list of pages by title.
        $infoByTitle = [];
        foreach ($res['query']['pages'] as $pageInfo) {
            $infoByTitle[$pageInfo['title']] = $pageInfo;
        }

        return $infoByTitle;
    }
67
68
    /**
     * Fetch the wikitext of several pages in one API request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as returned by the API, with the page text as the
     *   values. Pages without revision content map to an empty string. Empty array if the response
     *   contains no 'pages' list.
     */
    public function getPagesWikitext(Project $project, array $pageTitles): array
    {
        $res = $this->executeApiRequest($project, [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => join('|', $pageTitles),
            'formatversion' => 2,
        ]);

        $wikitext = [];
        foreach ($res['query']['pages'] ?? [] as $page) {
            $wikitext[$page['title']] = $page['revisions'][0]['content'] ?? '';
        }

        return $wikitext;
    }
99
100
    /**
     * Get all revisions of a single page, served from cache when available.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return string[] One row per revision, with the columns selected by getRevisionsStmt():
     *   id, timestamp, minor, length, length_change, user_id, username, comment, sha, deleted.
     */
    public function getRevisions(Page $page, ?User $user = null, $start = false, $end = false): array
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');

        if (!$this->cache->hasItem($cacheKey)) {
            // No limit and no known revision count: fetch everything in the date range.
            $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)
                ->fetchAllAssociative();

            return $this->setCache($cacheKey, $revisions);
        }

        return $this->cache->getItem($cacheKey)->get();
    }
121
122
    /**
     * Get the statement for the revisions of a single page, so that you can iterate row by row.
     * Rows are returned in ascending timestamp order. When a LIMIT applies, the inner query
     * selects the most recent revisions first and the outer query re-sorts them.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param ?int $limit Max number of revisions to return. Only applied when it is greater than
     *   zero AND $numRevisions is also given.
     * @param ?int $numRevisions Number of revisions, if known. The value itself is not used in the
     *   query; it only determines whether $limit is applied.
     * @param false|int $start
     * @param false|int $end
     * @return ResultStatement
     */
    public function getRevisionsStmt(
        Page $page,
        ?User $user = null,
        ?int $limit = null,
        ?int $numRevisions = null,
        $start = false,
        $end = false
    ): ResultStatement {
        $project = $page->getProject();

        $revTable = $this->getTableName(
            $project->getDatabaseName(),
            'revision',
            $user ? null : '' // Use 'revision' if there's no user, otherwise default to revision_userindex
        );
        $commentTable = $project->getTableName('comment');
        $actorTable = $project->getTableName('actor');

        $userClause = $user ? "revs.rev_actor = :actorId AND " : "";
        $limitClause = (intval($limit) > 0 && isset($numRevisions)) ? "LIMIT $limit" : '';
        $dateConditions = $this->getDateConditions($start, $end, false, 'revs.');

        $sql = "SELECT * FROM (
                    SELECT
                        revs.rev_id AS `id`,
                        revs.rev_timestamp AS `timestamp`,
                        revs.rev_minor_edit AS `minor`,
                        revs.rev_len AS `length`,
                        (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS `length_change`,
                        actor_user AS user_id,
                        actor_name AS username,
                        comment_text AS `comment`,
                        revs.rev_sha1 AS `sha`,
                        revs.rev_deleted AS `deleted`
                    FROM $revTable AS revs
                    LEFT JOIN $actorTable ON revs.rev_actor = actor_id
                    LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                    LEFT OUTER JOIN $commentTable ON comment_id = revs.rev_comment_id
                    WHERE $userClause revs.rev_page = :pageid $dateConditions
                    ORDER BY revs.rev_timestamp DESC
                    $limitClause
                ) a
                ORDER BY `timestamp` ASC";

        // :actorId is only present in the SQL when a user was given, so only bind it then.
        $bindings = ['pageid' => $page->getId()];
        if ($user) {
            $bindings['actorId'] = $user->getActorId($project);
        }

        return $this->executeProjectsQuery($project, $sql, $bindings);
    }
187
188
    /**
     * Get a count of the number of revisions of a single page, optionally restricted to
     * one user and/or a date range. The result is cached under a key built from the arguments.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return int
     */
    public function getNumRevisions(Page $page, ?User $user = null, $start = false, $end = false): int
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        // In this case revision is faster than revision_userindex if we're not querying by user.
        $revTable = $page->getProject()->getTableName(
            'revision',
            $user && $this->isWMF ? '_userindex' : ''
        );
        $userClause = $user ? "rev_actor = :actorId AND " : "";

        $dateConditions = $this->getDateConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $dateConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            // The key must match the :actorId placeholder used in $userClause.
            $params['actorId'] = $user->getActorId($page->getProject());
        }

        $result = (int)$this->executeProjectsQuery($page->getProject(), $sql, $params)->fetchOne();

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
225
226
    /**
     * Look up the open CheckWiki errors recorded for a single page.
     * @param Page $page
     * @return array Rows with the keys error, notice, found, name, prio and explanation.
     *   Always empty for non-mainspace pages and on non-WMF installations.
     */
    public function getCheckWikiErrors(Page $page): array
    {
        // Only support mainspace on Labs installations
        if (0 !== $page->getNamespace()) {
            return [];
        }
        if (!$this->isWMF) {
            return [];
        }

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $bindings = [
            // remove _p if present
            'dbName' => preg_replace('/_p$/', '', $page->getProject()->getDatabaseName()),
            // Page title without underscores (str_replace just to be sure)
            'title' => str_replace('_', ' ', $page->getTitle()),
        ];

        return $this->getToolsConnection()
            ->executeQuery($sql, $bindings)
            ->fetchAllAssociative();
    }
259
260
    /**
     * Get the Wikidata label and description of the page, in the language of the page's project.
     * @param Page $page
     * @return string[][] Empty if the page has no Wikidata ID, otherwise in the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     */
    public function getWikidataInfo(Page $page): array
    {
        $qid = $page->getWikidataId();
        if (empty($qid)) {
            return [];
        }

        $wikidataDb = 'wikidatawiki_p';

        $sql = "SELECT wby_name AS term, wbx_text AS term_text
                FROM $wikidataDb.wbt_item_terms
                JOIN $wikidataDb.wbt_term_in_lang ON wbit_term_in_lang_id = wbtl_id
                JOIN $wikidataDb.wbt_type ON wbtl_type_id = wby_id
                JOIN $wikidataDb.wbt_text_in_lang ON wbtl_text_in_lang_id = wbxl_id
                JOIN $wikidataDb.wbt_text ON wbxl_text_id = wbx_id
                WHERE wbit_item_id = :wikidataId
                AND wby_name IN ('label', 'description')
                AND wbxl_language = :lang";

        $bindings = [
            'lang' => $page->getProject()->getLang(),
            // The table stores the numeric part of the ID only, so strip the leading 'Q'.
            'wikidataId' => ltrim($qid, 'Q'),
        ];

        $stmt = $this->executeProjectsQuery('wikidatawiki', $sql, $bindings);

        return $stmt->fetchAllAssociative();
    }
294
295
    /**
     * Get or count all wikidata items for the given page,
     *     not just languages of sister projects
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      For a page without a Wikidata ID this is [] or 0 respectively.
     */
    public function getWikidataItems(Page $page, bool $count = false)
    {
        $qid = $page->getWikidataId();
        if (!$qid) {
            return $count ? 0 : [];
        }

        $columns = $count ? 'COUNT(*) AS count' : '*';

        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        $rows = $this->executeProjectsQuery('wikidatawiki', $sql, [
            // Strip the leading 'Q' before binding the ID.
            'wikidataId' => ltrim($qid, 'Q'),
        ])->fetchAllAssociative();

        if ($count) {
            return (int) $rows[0]['count'];
        }

        return $rows;
    }
321
322
    /**
     * Count the external links, outgoing links, incoming links and redirects of the given page.
     * @param Page $page
     * @return int[] Counts with the keys 'links_ext_count', 'links_out_count',
     *               'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page): array
    {
        $project = $page->getProject();
        $externalLinksTable = $project->getTableName('externallinks');
        $pageLinksTable = $project->getTableName('pagelinks');
        $linkTargetTable = $project->getTableName('linktarget');
        $redirectTable = $project->getTableName('redirect');

        // One row per metric; the 'type' column identifies which count the row holds.
        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable
                JOIN $linkTargetTable ON lt_id = pl_target_id
                WHERE lt_namespace = :namespace AND lt_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->executeProjectsQuery($project, $sql, [
            'id' => $page->getId(),
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Transform to associative array by 'type'
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = (int)$row['value'];
        }

        return $counts;
    }
365
366
    /**
     * Count wikidata items for the given page, not just languages of sister projects.
     * Convenience wrapper around getWikidataItems() in count mode.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page): int
    {
        $numItems = $this->getWikidataItems($page, true);

        return $numItems;
    }
375
376
    /**
     * Get page views for the given page and timeframe.
     * Responses are memoized in a static variable, keyed by the request URL, so repeated
     * calls for the same page and timeframe do not hit the API again.
     * @fixme use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start In the format YYYYMMDD
     * @param string|DateTime $end In the format YYYYMMDD
     * @return string[][][]
     * @throws BadGatewayException
     */
    public function getPageviews(Page $page, $start, $end): array
    {
        // Keyed by request URL, which encodes project, title, start and end. A single unkeyed
        // slot would hand the first call's data to later calls with different arguments.
        // Better of course would be to move to a Symfony CachingHttpClient instead of Guzzle across the board.
        static $pageviews = [];

        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));

        // Normalize both bounds to YYYYMMDD strings.
        if (!($start instanceof DateTime)) {
            $start = new DateTime($start);
        }
        $start = $start->format('Ymd');
        if (!($end instanceof DateTime)) {
            $end = new DateTime($end);
        }
        $end = $end->format('Ymd');

        $project = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
            "$project/all-access/user/$title/daily/$start/$end";

        if (isset($pageviews[$url])) {
            return $pageviews[$url];
        }

        try {
            $res = $this->guzzle->request('GET', $url, [
                // Five seconds should be plenty...
                RequestOptions::CONNECT_TIMEOUT => 5,
            ]);
            $pageviews[$url] = json_decode($res->getBody()->getContents(), true);
            return $pageviews[$url];
        } catch (ServerException|ConnectException $e) {
            throw new BadGatewayException('api-error-wikimedia', ['Pageviews'], $e);
        }
    }
425
426
    /**
     * Get the full HTML content of the page.
     * On WMF installations the HTML comes from the project's REST API; otherwise the
     * page's own URL is requested.
     * @param Page $page
     * @param int|null $revId What revision to query for.
     * @return string
     * @throws BadGatewayException
     */
    public function getHTMLContent(Page $page, ?int $revId = null): string
    {
        $wantsRevision = null !== $revId;

        if ($this->isWMF) {
            $host = $page->getProject()->getDomain();
            $encodedTitle = urlencode(str_replace(' ', '_', $page->getTitle()));
            $url = "https://$host/api/rest_v1/page/html/" . $encodedTitle;
            if ($wantsRevision) {
                $url .= "/$revId";
            }
        } else {
            $url = $page->getUrl();
            if ($wantsRevision) {
                $url .= "?oldid=$revId";
            }
        }

        try {
            $response = $this->guzzle->request('GET', $url);

            return $response->getBody()->getContents();
        } catch (ServerException $e) {
            throw new BadGatewayException('api-error-wikimedia', ['Wikimedia REST'], $e);
        } catch (ClientException $e) {
            $isSpurious404 = $page->exists() && Response::HTTP_NOT_FOUND === $e->getCode();
            if (!$isSpurious404) {
                throw $e;
            }

            // Sometimes the REST API throws 404s when the page does in fact exist.
            throw new BadGatewayException('api-error-wikimedia', ['Wikimedia REST'], $e);
        }
    }
462
463
    /**
     * Get the ID of the revision of a page at the time of the given DateTime.
     * This is the highest revision ID of the page whose timestamp is not after $date.
     * @param Page $page
     * @param DateTime $date
     * @return int The revision ID; the query result is cast to int, so 0 when nothing matches.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date): int
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // Bind the values rather than interpolating them into the SQL, as the other
        // queries in this class do. MAX() without GROUP BY yields a single row.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :datestamp
                AND rev_page = :pageId";

        $resultQuery = $this->getProjectsConnection($page->getProject())
            ->executeQuery($sql, [
                'datestamp' => $date->format('YmdHis'),
                'pageId' => $page->getId(),
            ]);

        return (int)$resultQuery->fetchOne();
    }
482
483
    /**
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
     * This will send t/50 API requests (rounded up) where t is the number of titles supplied.
     * @param Project $project The project.
     * @param string[] $pageTitles The titles to fetch.
     * @return string[] Keys are the original supplied title, and values are the display titles.
     *   Titles from a batch whose response contains no 'pages' list are absent from the result.
     */
    public function displayTitles(Project $project, array $pageTitles): array
    {
        $displayTitles = [];

        // Query the API in batches of 50 titles.
        foreach (array_chunk($pageTitles, 50) as $titleBatch) {
            $res = $this->guzzle->request('GET', $project->getApiUrl(), ['query' => [
                'action' => 'query',
                'prop' => 'info|pageprops',
                'inprop' => 'displaytitle',
                'titles' => join('|', $titleBatch),
                'format' => 'json',
            ]]);
            $result = json_decode($res->getBody()->getContents(), true);

            // Extract normalization info: map each normalized title back to the title as supplied.
            $normalized = [];
            foreach ($result['query']['normalized'] ?? [] as $entry) {
                $normalized[$entry['to']] = $entry['from'];
            }

            // Match up the normalized titles with the display titles and the original titles.
            // An error or undecodable response has no 'pages' list; skip it rather than
            // iterating over a missing value.
            foreach ($result['query']['pages'] ?? [] as $pageInfo) {
                $displayTitle = $pageInfo['pageprops']['displaytitle'] ?? $pageInfo['title'];
                $origTitle = $normalized[$pageInfo['title']] ?? $pageInfo['title'];
                $displayTitles[$origTitle] = $displayTitle;
            }
        }

        return $displayTitles;
    }
528
}
529