Passed
Push — master ( 8b9089...a51f69 )
by MusikAnimal
01:23
created

PagesRepository::getWikidataItems()   B

Complexity

Conditions 5
Paths 6

Size

Total Lines 20
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 20
rs 8.8571
c 0
b 0
f 0
cc 5
eloc 10
nc 6
nop 2
1
<?php
2
/**
3
 * This file contains only the PagesRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateInterval;
9
use Mediawiki\Api\SimpleRequest;
10
use GuzzleHttp;
11
12
/**
13
 * A PagesRepository fetches data about Pages, either one page at a time or several at once.
14
 * @codeCoverageIgnore
15
 */
16
class PagesRepository extends Repository
17
{
18
19
    /**
     * Fetch API metadata for one page.
     *
     * Delegates to getPagesInfo() with a single-element title list and hands back the first
     * entry of whatever that call returns.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[]|null Array with some of the following keys: pageid, title, missing,
     * displaytitle, url. Null when getPagesInfo() returns no entries.
     */
    public function getPageInfo(Project $project, $pageTitle)
    {
        foreach ($this->getPagesInfo($project, [$pageTitle]) as $firstPage) {
            // Only the first entry is of interest.
            return $firstPage;
        }

        return null;
    }
31
32
    /**
     * Fetch API metadata for several pages in a single 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return array[] Array keyed by the page titles reported in the response, each element with
     * some of the following keys: pageid, title, missing, displaytitle, url. Empty when the
     * response carries no 'pages' list.
     */
    public function getPagesInfo(Project $project, $pageTitles)
    {
        // @TODO: Also include 'extlinks' prop (with 'ellimit' and 'elexpandurl') when we start
        // checking for dead external links.
        // FIXME: allow page IDs via the 'pageids' parameter.
        $inprop = 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle';
        $request = new SimpleRequest('query', [
            'prop' => 'info|pageprops',
            'inprop' => $inprop,
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        if (!isset($response['query']['pages'])) {
            return [];
        }

        // Re-key the page list by title.
        $pagesByTitle = [];
        foreach ($response['query']['pages'] as $pageInfo) {
            $pagesByTitle[$pageInfo['title']] = $pageInfo;
        }

        return $pagesByTitle;
    }
64
65
    /**
     * Fetch the wikitext of several pages in a single 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles reported in the response, with the page
     * text as the values. A page without revision content maps to an empty string.
     */
    public function getPagesWikitext(Project $project, $pageTitles)
    {
        $request = new SimpleRequest('query', [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        $wikitextByTitle = [];
        if (isset($response['query']['pages'])) {
            foreach ($response['query']['pages'] as $page) {
                $hasContent = isset($page['revisions'][0]['content']);
                $wikitextByTitle[$page['title']] = $hasContent ? $page['revisions'][0]['content'] : '';
            }
        }

        return $wikitextByTitle;
    }
98
99
    /**
     * Fetch all revisions of one page, served from the cache when available.
     *
     * On a cache miss the rows are read through getRevisionsStmt() and stored for 10 minutes.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @return string[] Each member with the columns selected by getRevisionsStmt(): id,
     * timestamp, minor, length, length_change, user_id, username, comment.
     */
    public function getRevisions(Page $page, User $user = null)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');

        if (!$this->cache->hasItem($cacheKey)) {
            $this->stopwatch->start($cacheKey, 'XTools');

            $revisions = $this->getRevisionsStmt($page, $user)->fetchAll();

            // Keep the rows around for 10 minutes.
            $cacheItem = $this->cache->getItem($cacheKey);
            $cacheItem->set($revisions);
            $cacheItem->expiresAfter(new DateInterval('PT10M'));
            $this->cache->save($cacheItem);

            $this->stopwatch->stop($cacheKey);

            return $revisions;
        }

        return $this->cache->getItem($cacheKey)->get();
    }
126
127
    /**
     * Get the executed statement for the revisions of a single page, so that you can iterate
     * row by row.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int $limit Max number of revisions to process. Only honoured when $numRevisions is
     *   also given; otherwise every revision is returned.
     * @param int $numRevisions Number of revisions, if known. This is used solely to determine the
     *   OFFSET when a $limit is given (see below).
     * @return \Doctrine\DBAL\Driver\PDOStatement
     */
    public function getRevisionsStmt(Page $page, User $user = null, $limit = null, $numRevisions = null)
    {
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "revs.rev_user_text in (:username) AND " : "";

        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
        // revision and work its way forward for proper processing. Consequently, if we want to do
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
        // supply as the OFFSET.
        $limitClause = '';
        // Both values are interpolated into the SQL below, so force them to integers first.
        $limit = intval($limit);
        if ($limit > 0 && isset($numRevisions)) {
            // Clamp at zero: a limit larger than the revision count must not yield a negative
            // OFFSET, which would be invalid SQL.
            $offset = max(0, intval($numRevisions) - $limit);
            $limitClause = "LIMIT $offset, $limit";
        }

        $sql = "SELECT
                    revs.rev_id AS id,
                    revs.rev_timestamp AS timestamp,
                    revs.rev_minor_edit AS minor,
                    revs.rev_len AS length,
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                    revs.rev_user AS user_id,
                    revs.rev_user_text AS username,
                    revs.rev_comment AS comment
                FROM $revTable AS revs
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                WHERE $userClause revs.rev_page = :pageid
                ORDER BY revs.rev_timestamp ASC
                $limitClause";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        $conn = $this->getProjectsConnection();
        return $conn->executeQuery($sql, $params);
    }
175
176
    /**
     * Get a count of the number of revisions of a single page.
     *
     * The count is cached for 10 minutes.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @return int
     */
    public function getNumRevisions(Page $page, User $user = null)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            return (int) $this->cache->getItem($cacheKey)->get();
        }

        // Resolve the table name through the repository, as every other query in this class does.
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "rev_user_text in (:username) AND " : "";

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        $conn = $this->getProjectsConnection();
        // Cast so the value matches the documented int return type.
        $result = (int) $conn->executeQuery($sql, $params)->fetchColumn(0);

        // Cache for 10 minutes, and return.
        $cacheItem = $this->cache->getItem($cacheKey)
            ->set($result)
            ->expiresAfter(new DateInterval('PT10M'));
        $this->cache->save($cacheItem);
        return $result;
    }
210
211
    /**
     * Get various basic info used in the API, including the
     *   number of revisions, unique authors, initial author
     *   and edit count of the initial author.
     * This is combined into one query for better performance.
     * Caching is only applied if it took considerable time to process,
     *   because using the gadget, this will get hit for a different page
     *   constantly, where the likelihood of cache benefiting us is slim.
     * @param Page $page The page.
     * @return string[]|mixed The single result row with the keys num_edits, num_editors, author,
     *   created_at, created_rev_id, modified_at, modified_rev_id and author_editcount, or
     *   whatever fetch() yields when the query produces no row.
     */
    public function getBasicEditingInfo(Page $page)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_basicinfo');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userTable = $this->getTableName($page->getProject()->getDatabaseName(), 'user');
        $pageTable = $this->getTableName($page->getProject()->getDatabaseName(), 'page');

        $sql = "SELECT *, (
                   SELECT user_editcount
                   FROM $userTable
                   WHERE user_name = author
                ) AS author_editcount
                FROM (
                    (
                        SELECT COUNT(*) AS num_edits,
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) a,
                    (
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
                        #   and the lowest rev_id may not be the first revision.
                        SELECT rev_user_text AS author,
                               rev_timestamp AS created_at,
                               rev_id AS created_rev_id
                        FROM $revTable
                        WHERE rev_page = :pageid
                        ORDER BY rev_timestamp ASC
                        LIMIT 1
                    ) b,
                    (
                        SELECT MAX(rev_timestamp) AS modified_at
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) c,
                    (
                        SELECT page_latest AS modified_rev_id
                        FROM $pageTable
                        WHERE page_id = :pageid
                    ) d
                );";
        $params = ['pageid' => $page->getId()];
        $conn = $this->getProjectsConnection();

        // Start the stopwatch event here so that the stop() below has a matching start(),
        // as in getRevisions().
        $this->stopwatch->start($cacheKey, 'XTools');

        // Time the query so we can decide whether or not to cache the result.
        // microtime() is used because time() only has one-second resolution.
        $startTime = microtime(true);
        $result = $conn->executeQuery($sql, $params)->fetch();
        $elapsed = microtime(true) - $startTime;

        $this->stopwatch->stop($cacheKey);

        // If it took over 5 seconds, cache the result for 20 minutes.
        if ($elapsed > 5) {
            $cacheItem = $this->cache->getItem($cacheKey)
                ->set($result)
                ->expiresAfter(new DateInterval('PT20M'));
            $this->cache->save($cacheItem);
        }

        return $result;
    }
287
288
    /**
     * Get assessment data for the given pages.
     *
     * Results are cached for 10 minutes.
     * @param Project $project The project to which the pages belong.
     * @param int[] $pageIds Page IDs
     * @return string[] Assessment data as retrieved from the database, each row with the keys
     *   wikiproject, class and importance. Empty when the project has no page assessments or
     *   no page IDs were given.
     */
    public function getAssessments(Project $project, $pageIds)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_assessments');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        if (!$project->hasPageAssessments()) {
            return [];
        }

        // The ID list is interpolated into the SQL below, so force every entry to an integer.
        // Note the (glue, pieces) argument order: the reversed legacy order is gone in PHP 8.
        $idList = implode(',', array_map('intval', (array) $pageIds));
        if ($idList === '') {
            // "IN ()" is a syntax error, and there is nothing to look up anyway.
            return [];
        }

        $paTable = $this->getTableName($project->getDatabaseName(), 'page_assessments');
        $papTable = $this->getTableName($project->getDatabaseName(), 'page_assessments_projects');

        $query = "SELECT pap_project_title AS wikiproject, pa_class AS class, pa_importance AS importance
                  FROM $paTable
                  LEFT JOIN $papTable ON pa_project_id = pap_project_id
                  WHERE pa_page_id IN ($idList)";

        $conn = $this->getProjectsConnection();
        $result = $conn->executeQuery($query)->fetchAll();

        // Cache for 10 minutes, and return.
        $cacheItem = $this->cache->getItem($cacheKey)
            ->set($result)
            ->expiresAfter(new DateInterval('PT10M'));
        $this->cache->save($cacheItem);
        return $result;
    }
323
324
    /**
     * Look up the open CheckWiki errors recorded for one page.
     *
     * Nothing is queried unless the page is in namespace 0 and isLabs() reports true.
     * @param Page $page
     * @return array Rows with the keys error, notice, found, name, prio and explanation;
     *   empty when the page is not supported.
     */
    public function getCheckWikiErrors(Page $page)
    {
        // Only support mainspace on Labs installations
        $isSupported = $page->getNamespace() === 0 && $this->isLabs();
        if (!$isSupported) {
            return [];
        }

        // The CheckWiki tables store the database name without a trailing _p
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // ...and the page title with spaces rather than underscores (str_replace just to be sure)
        $pageTitle = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $statement = $this->getToolsConnection()->prepare($sql);
        $statement->bindParam(':dbName', $dbName);
        $statement->bindParam(':title', $pageTitle);
        $statement->execute();

        return $statement->fetchAll();
    }
358
359
    /**
     * Get basic wikidata on the page: label and description, plus the labels of the items and
     * properties linked from the page's Wikidata item.
     * @param Page $page
     * @return string[] In the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     *   Empty when the page has no Wikidata ID.
     */
    public function getWikidataInfo(Page $page)
    {
        $fullWikidataId = $page->getWikidataId();
        if (empty($fullWikidataId)) {
            return [];
        }

        // Numeric part of the ID, as compared against epp_entity_id below.
        $wikidataId = ltrim($fullWikidataId, 'Q');
        // Title of the item's page on Wikidata. This has to be bound as its own parameter:
        // a placeholder written inside a quoted SQL string is never substituted.
        $entityTitle = 'Q' . $wikidataId;
        $lang = $page->getProject()->getLang();

        $sql = "SELECT IF(term_type = 'label', 'label', 'description') AS term, term_text
                FROM wikidatawiki_p.wb_entity_per_page
                JOIN wikidatawiki_p.page ON epp_page_id = page_id
                JOIN wikidatawiki_p.wb_terms ON term_entity_id = epp_entity_id
                    AND term_language = :lang
                    AND term_type IN ('label', 'description')
                WHERE epp_entity_id = :wikidataId

                UNION

                SELECT pl_title AS term, wb_terms.term_text
                FROM wikidatawiki_p.pagelinks
                JOIN wikidatawiki_p.wb_terms ON term_entity_id = SUBSTRING(pl_title, 2)
                    AND term_entity_type = (IF(SUBSTRING(pl_title, 1, 1) = 'Q', 'item', 'property'))
                    AND term_language = :lang
                    AND term_type = 'label'
                WHERE pl_namespace IN (0, 120)
                    AND pl_from = (
                        SELECT page_id FROM wikidatawiki_p.page
                        WHERE page_namespace = 0
                            AND page_title = :entityTitle
                    )";

        $resultQuery = $this->getProjectsConnection()->prepare($sql);
        $resultQuery->bindParam(':lang', $lang);
        $resultQuery->bindParam(':wikidataId', $wikidataId);
        $resultQuery->bindParam(':entityTitle', $entityTitle);
        $resultQuery->execute();

        return $resultQuery->fetchAll();
    }
407
408
    /**
     * Get or count all wikidata items for the given page,
     *     not just languages of sister projects.
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      A page without a Wikidata ID gives [] (or 0 when counting).
     */
    public function getWikidataItems(Page $page, $count = false)
    {
        if (!$page->getWikidataId()) {
            if ($count) {
                return 0;
            }
            return [];
        }

        // Only the numeric part of the ID is matched against ips_item_id.
        $wikidataId = ltrim($page->getWikidataId(), 'Q');

        $columns = $count ? 'COUNT(*) AS count' : '*';
        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        $statement = $this->getProjectsConnection()->prepare($sql);
        $statement->bindParam(':wikidataId', $wikidataId);
        $statement->execute();

        $rows = $statement->fetchAll();

        if (!$count) {
            return $rows;
        }

        return (int) $rows[0]['count'];
    }
436
437
    /**
     * Count the external links, outgoing links, incoming links and redirects of a page.
     *
     * All four counts come from one UNION query, one row per count.
     * @param Page $page
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
     *                  'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page)
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->getProjectsConnection()->executeQuery($sql, [
            'id' => $page->getId(),
            // The link tables are queried with underscores in the title.
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Turn the (value, type) rows into a map of "<type>_count" => value.
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = $row['value'];
        }

        return $counts;
    }
479
480
    /**
     * Count wikidata items for the given page, not just languages of sister projects.
     *
     * Shorthand for getWikidataItems() with its $count flag switched on.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page)
    {
        $countOnly = true;

        return $this->getWikidataItems($page, $countOnly);
    }
489
490
    /**
     * Get page views for the given page and timeframe from the Wikimedia REST API
     * (per-article, all-access, user agent type "user", daily granularity).
     * @FIXME use Symfony Guzzle package.
     * @param Page $page
     * @param string|\DateTimeInterface $start Start date; strings must be in the format YYYYMMDD
     * @param string|\DateTimeInterface $end End date; strings must be in the format YYYYMMDD
     * @return string[] The decoded JSON response body, as returned by json_decode().
     */
    public function getPageviews(Page $page, $start, $end)
    {
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
        $client = new GuzzleHttp\Client();

        // The class name is fully qualified on purpose: inside this namespace a bare "DateTime"
        // would resolve to Xtools\DateTime, which does not exist, and the check would silently
        // never match. 'Ymd' is the date() pattern that produces YYYYMMDD.
        if ($start instanceof \DateTimeInterface) {
            $start = $start->format('Ymd');
        }
        if ($end instanceof \DateTimeInterface) {
            $end = $end->format('Ymd');
        }

        $project = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
            "$project/all-access/user/$title/daily/$start/$end";

        $res = $client->request('GET', $url);
        return json_decode($res->getBody()->getContents(), true);
    }
518
}
519