Completed
Pull Request — master (#114)
by MusikAnimal
02:18
created

PagesRepository::countWikidataItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
/**
3
 * This file contains only the PagesRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateInterval;
9
use Mediawiki\Api\SimpleRequest;
10
use GuzzleHttp;
11
12
/**
13
 * A PagesRepository fetches data about Pages, either singularly or for multiple.
14
 */
15
class PagesRepository extends Repository
16
{
17
18
    /**
19
     * Get metadata about a single page from the API.
20
     * @param Project $project The project to which the page belongs.
21
     * @param string $pageTitle Page title.
22
     * @return string[] Array with some of the following keys: pageid, title, missing, displaytitle,
23
     * url.
24
     */
25
    public function getPageInfo(Project $project, $pageTitle)
26
    {
27
        $info = $this->getPagesInfo($project, [$pageTitle]);
28
        return array_shift($info);
29
    }
30
31
    /**
32
     * Get metadata about a set of pages from the API.
33
     * @param Project $project The project to which the pages belong.
34
     * @param string[] $pageTitles Array of page titles.
35
     * @return string[] Array keyed by the page names, each element with some of the
36
     * following keys: pageid, title, missing, displaytitle, url.
37
     */
38
    public function getPagesInfo(Project $project, $pageTitles)
39
    {
40
        // @TODO: Also include 'extlinks' prop when we start checking for dead external links.
41
        $params = [
42
            'prop' => 'info|pageprops',
43
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle',
44
            'converttitles' => '',
45
            // 'ellimit' => 20,
46
            // 'elexpandurl' => '',
47
            'titles' => join('|', $pageTitles),
48
            'formatversion' => 2
49
            // 'pageids' => $pageIds // FIXME: allow page IDs
50
        ];
51
52
        $query = new SimpleRequest('query', $params);
53
        $api = $this->getMediawikiApi($project);
54
        $res = $api->getRequest($query);
55
        $result = [];
56
        if (isset($res['query']['pages'])) {
57
            foreach ($res['query']['pages'] as $pageInfo) {
58
                $result[$pageInfo['title']] = $pageInfo;
59
            }
60
        }
61
        return $result;
62
    }
63
64
    /**
65
     * Get the full page text of a set of pages.
66
     * @param Project $project The project to which the pages belong.
67
     * @param string[] $pageTitles Array of page titles.
68
     * @return string[] Array keyed by the page names, with the page text as the values.
69
     */
70
    public function getPagesWikitext(Project $project, $pageTitles)
71
    {
72
        $query = new SimpleRequest('query', [
73
            'prop' => 'revisions',
74
            'rvprop' => 'content',
75
            'titles' => join('|', $pageTitles),
76
            'formatversion' => 2,
77
        ]);
78
        $result = [];
79
80
        $api = $this->getMediawikiApi($project);
81
        $res = $api->getRequest($query);
82
83
        if (!isset($res['query']['pages'])) {
84
            return [];
85
        }
86
87
        foreach ($res['query']['pages'] as $page) {
88
            if (isset($page['revisions'][0]['content'])) {
89
                $result[$page['title']] = $page['revisions'][0]['content'];
90
            } else {
91
                $result[$page['title']] = '';
92
            }
93
        }
94
95
        return $result;
96
    }
97
98
    /**
99
     * Get revisions of a single page.
100
     * @param Page $page The page.
101
     * @param User|null $user Specify to get only revisions by the given user.
102
     * @return string[] Each member with keys: id, timestamp, length-
103
     */
104
    public function getRevisions(Page $page, User $user = null)
105
    {
106
        $cacheKey = 'revisions.'.$page->getId();
107
        if ($user) {
108
            $cacheKey .= '.'.$user->getCacheKey();
109
        }
110
111
        if ($this->cache->hasItem($cacheKey)) {
112
            return $this->cache->getItem($cacheKey)->get();
113
        }
114
115
        $this->stopwatch->start($cacheKey, 'XTools');
116
117
        $stmt = $this->getRevisionsStmt($page, $user);
118
        $result = $stmt->fetchAll();
119
120
        // Cache for 10 minutes, and return.
121
        $cacheItem = $this->cache->getItem($cacheKey)
122
            ->set($result)
123
            ->expiresAfter(new DateInterval('PT10M'));
124
        $this->cache->save($cacheItem);
125
        $this->stopwatch->stop($cacheKey);
126
127
        return $result;
128
    }
129
130
    /**
131
     * Get the statement for a single revision, so that you can iterate row by row.
132
     * @param Page $page The page.
133
     * @param User|null $user Specify to get only revisions by the given user.
134
     * @param int $limit Max number of revisions to process.
135
     * @param int $numRevisions Number of revisions, if known. This is used solely to determine the
136
     *   OFFSET if we are given a $limit (see below). If $limit is set and $numRevisions is not set,
137
     *   a separate query is ran to get the nuber of revisions.
138
     * @return Doctrine\DBAL\Driver\PDOStatement
139
     */
140
    public function getRevisionsStmt(Page $page, User $user = null, $limit = null, $numRevisions = null)
141
    {
142
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
143
        $userClause = $user ? "revs.rev_user_text in (:username) AND " : "";
144
145
        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
146
        // revision and work its way forward for proper processing. Consequently, if we want to do
147
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
148
        // supply as the OFFSET.
149
        $limitClause = '';
150
        if (intval($limit) > 0 && isset($numRevisions)) {
151
            $offset = $numRevisions - $limit;
152
            $limitClause = "LIMIT $offset, $limit";
153
        }
154
155
        $sql = "SELECT
156
                    revs.rev_id AS id,
157
                    revs.rev_timestamp AS timestamp,
158
                    revs.rev_minor_edit AS minor,
159
                    revs.rev_len AS length,
160
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
161
                    revs.rev_user AS user_id,
162
                    revs.rev_user_text AS username,
163
                    revs.rev_comment AS comment
164
                FROM $revTable AS revs
165
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
166
                WHERE $userClause revs.rev_page = :pageid
167
                ORDER BY revs.rev_timestamp ASC
168
                $limitClause";
169
170
        $params = ['pageid' => $page->getId()];
171
        if ($user) {
172
            $params['username'] = $user->getUsername();
173
        }
174
175
        $conn = $this->getProjectsConnection();
176
        return $conn->executeQuery($sql, $params);
177
    }
178
179
    /**
180
     * Get a count of the number of revisions of a single page
181
     * @param Page $page The page.
182
     * @param User|null $user Specify to only count revisions by the given user.
183
     * @return int
184
     */
185
    public function getNumRevisions(Page $page, User $user = null)
186
    {
187
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
188
        $userClause = $user ? "rev_user_text in (:username) AND " : "";
189
190
        $sql = "SELECT COUNT(*)
191
                FROM $revTable
192
                WHERE $userClause rev_page = :pageid";
193
        $params = ['pageid' => $page->getId()];
194
        if ($user) {
195
            $params['username'] = $user->getUsername();
196
        }
197
        $conn = $this->getProjectsConnection();
198
        return $conn->executeQuery($sql, $params)->fetchColumn(0);
199
    }
200
201
    /**
202
     * Get various basic info used in the API, including the
203
     *   number of revisions, unique authors, initial author
204
     *   and edit count of the initial author.
205
     * This is combined into one query for better performance.
206
     * Caching is only applied if it took considerable time to process,
207
     *   because using the gadget, this will get hit for a different page
208
     *   constantly, where the likelihood of cache benefiting us is slim.
209
     * @param Page $page The page.
210
     * @return string[]
211
     */
212
    public function getBasicEditingInfo(Page $page)
213
    {
214
        $cacheKey = 'page.basicInfo.'.$page->getId();
215
        if ($this->cache->hasItem($cacheKey)) {
216
            return $this->cache->getItem($cacheKey)->get();
217
        }
218
219
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
220
        $userTable = $this->getTableName($page->getProject()->getDatabaseName(), 'user');
221
        $pageTable = $this->getTableName($page->getProject()->getDatabaseName(), 'page');
222
223
        $sql = "SELECT *, (
224
                   SELECT user_editcount
225
                   FROM $userTable
226
                   WHERE user_name = author
227
                ) AS author_editcount
228
                FROM (
229
                    (
230
                        SELECT COUNT(*) AS num_edits,
231
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
232
                        FROM $revTable
233
                        WHERE rev_page = :pageid
234
                    ) a,
235
                    (
236
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
237
                        #   and the lowest rev_id may not be the first revision.
238
                        SELECT rev_user_text AS author,
239
                               rev_timestamp AS created_at,
240
                               rev_id AS created_rev_id
241
                        FROM $revTable
242
                        WHERE rev_page = :pageid
243
                        ORDER BY rev_timestamp ASC
244
                        LIMIT 1
245
                    ) b,
246
                    (
247
                        SELECT MAX(rev_timestamp) AS modified_at
248
                        FROM $revTable
249
                        WHERE rev_page = :pageid
250
                    ) c,
251
                    (
252
                        SELECT page_latest AS modified_rev_id
253
                        FROM $pageTable
254
                        WHERE page_id = :pageid
255
                    ) d
256
                );";
257
        $params = ['pageid' => $page->getId()];
258
        $conn = $this->getProjectsConnection();
259
260
        // Get current time so we can compare timestamps
261
        // and decide whether or to cache the result.
262
        $time1 = time();
263
        $result = $conn->executeQuery($sql, $params)->fetch();
264
        $time2 = time();
265
266
        // If it took over 5 seconds, cache the result for 20 minutes.
267
        if ($time2 - $time1 > 5) {
268
            $cacheItem = $this->cache->getItem($cacheKey)
269
                ->set($result)
270
                ->expiresAfter(new DateInterval('PT20M'));
271
            $this->cache->save($cacheItem);
272
            $this->stopwatch->stop($cacheKey);
273
        }
274
275
        return $result;
276
    }
277
278
    /**
279
     * Get assessment data for the given pages
280
     * @param Project   $project The project to which the pages belong.
281
     * @param  int[]    $pageIds Page IDs
282
     * @return string[] Assessment data as retrieved from the database.
283
     */
284
    public function getAssessments(Project $project, $pageIds)
285
    {
286
        if (!$project->hasPageAssessments()) {
287
            return [];
288
        }
289
        $paTable = $this->getTableName($project->getDatabaseName(), 'page_assessments');
290
        $papTable = $this->getTableName($project->getDatabaseName(), 'page_assessments_projects');
291
        $pageIds = implode($pageIds, ',');
292
293
        $query = "SELECT pap_project_title AS wikiproject, pa_class AS class, pa_importance AS importance
294
                  FROM $paTable
295
                  LEFT JOIN $papTable ON pa_project_id = pap_project_id
296
                  WHERE pa_page_id IN ($pageIds)";
297
298
        $conn = $this->getProjectsConnection();
299
        return $conn->executeQuery($query)->fetchAll();
300
    }
301
302
    /**
303
     * Get any CheckWiki errors of a single page
304
     * @param Page $page
305
     * @return array Results from query
306
     */
307
    public function getCheckWikiErrors(Page $page)
308
    {
309
        // Only support mainspace on Labs installations
310
        if ($page->getNamespace() !== 0 || !$this->isLabs()) {
311
            return [];
312
        }
313
314
        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
315
                FROM s51080__checkwiki_p.cw_error a
316
                JOIN s51080__checkwiki_p.cw_overview_errors b
317
                WHERE a.project = b.project
318
                AND a.project = :dbName
319
                AND a.title = :title
320
                AND a.error = b.id
321
                AND a.ok = 0";
322
323
        // remove _p if present
324
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());
325
326
        // Page title without underscores (str_replace just to be sure)
327
        $pageTitle = str_replace('_', ' ', $page->getTitle());
328
329
        $resultQuery = $this->getToolsConnection()->prepare($sql);
330
        $resultQuery->bindParam(':dbName', $dbName);
331
        $resultQuery->bindParam(':title', $pageTitle);
332
        $resultQuery->execute();
333
334
        return $resultQuery->fetchAll();
335
    }
336
337
    /**
338
     * Get basic wikidata on the page: label and description.
339
     * @param Page $page
340
     * @return string[] In the format:
341
     *    [[
342
     *         'term' => string such as 'label',
343
     *         'term_text' => string (value for 'label'),
344
     *     ], ... ]
345
     */
346
    public function getWikidataInfo(Page $page)
347
    {
348
        if (empty($page->getWikidataId())) {
349
            return [];
350
        }
351
352
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
353
        $lang = $page->getProject()->getLang();
354
355
        $sql = "SELECT IF(term_type = 'label', 'label', 'description') AS term, term_text
356
                FROM wikidatawiki_p.wb_entity_per_page
357
                JOIN wikidatawiki_p.page ON epp_page_id = page_id
358
                JOIN wikidatawiki_p.wb_terms ON term_entity_id = epp_entity_id
359
                    AND term_language = :lang
360
                    AND term_type IN ('label', 'description')
361
                WHERE epp_entity_id = :wikidataId
362
363
                UNION
364
365
                SELECT pl_title AS term, wb_terms.term_text
366
                FROM wikidatawiki_p.pagelinks
367
                JOIN wikidatawiki_p.wb_terms ON term_entity_id = SUBSTRING(pl_title, 2)
368
                    AND term_entity_type = (IF(SUBSTRING(pl_title, 1, 1) = 'Q', 'item', 'property'))
369
                    AND term_language = :lang
370
                    AND term_type = 'label'
371
                WHERE pl_namespace IN (0, 120)
372
                    AND pl_from = (
373
                        SELECT page_id FROM wikidatawiki_p.page
374
                        WHERE page_namespace = 0
375
                            AND page_title = 'Q:wikidataId'
376
                    )";
377
378
        $resultQuery = $this->getProjectsConnection()->prepare($sql);
379
        $resultQuery->bindParam(':lang', $lang);
380
        $resultQuery->bindParam(':wikidataId', $wikidataId);
381
        $resultQuery->execute();
382
383
        return $resultQuery->fetchAll();
384
    }
385
386
    /**
387
     * Get or count all wikidata items for the given page,
388
     *     not just languages of sister projects
389
     * @param Page $page
390
     * @param bool $count Set to true to get only a COUNT
391
     * @return string[]|int Records as returend by the DB,
392
     *                      or raw COUNT of the records.
393
     */
394
    public function getWikidataItems(Page $page, $count = false)
395
    {
396
        if (!$page->getWikidataId()) {
397
            return $count ? 0 : [];
398
        }
399
400
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
401
402
        $sql = "SELECT " . ($count ? 'COUNT(*) AS count' : '*') . "
403
                FROM wikidatawiki_p.wb_items_per_site
404
                WHERE ips_item_id = :wikidataId";
405
406
        $resultQuery = $this->getProjectsConnection()->prepare($sql);
407
        $resultQuery->bindParam(':wikidataId', $wikidataId);
408
        $resultQuery->execute();
409
410
        $result = $resultQuery->fetchAll();
411
412
        return $count ? (int) $result[0]['count'] : $result;
413
    }
414
415
    /**
416
     * Get number of in and outgoing links and redirects to the given page.
417
     * @param Page $page
418
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
419
     *                  'links_in_count' and 'redirects_count'
420
     */
421
    public function countLinksAndRedirects(Page $page)
422
    {
423
        $externalLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'externallinks');
424
        $pageLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'pagelinks');
425
        $redirectTable = $this->getTableName($page->getProject()->getDatabaseName(), 'redirect');
426
427
        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
428
                FROM $externalLinksTable WHERE el_from = :id
429
                UNION
430
                SELECT COUNT(*) AS value, 'links_out' AS type
431
                FROM $pageLinksTable WHERE pl_from = :id
432
                UNION
433
                SELECT COUNT(*) AS value, 'links_in' AS type
434
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
435
                UNION
436
                SELECT COUNT(*) AS value, 'redirects' AS type
437
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";
438
439
        $params = [
440
            'id' => $page->getId(),
441
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
442
            'namespace' => $page->getNamespace(),
443
        ];
444
445
        $conn = $this->getProjectsConnection();
446
        $res = $conn->executeQuery($sql, $params);
447
448
        $data = [];
449
450
        // Transform to associative array by 'type'
451
        foreach ($res as $row) {
452
            $data[$row['type'] . '_count'] = $row['value'];
453
        }
454
455
        return $data;
456
    }
457
458
    /**
459
     * Count wikidata items for the given page, not just languages of sister projects
460
     * @param Page $page
461
     * @return int Number of records.
462
     */
463
    public function countWikidataItems(Page $page)
464
    {
465
        return $this->getWikidataItems($page, true);
466
    }
467
468
    /**
469
     * Get page views for the given page and timeframe.
470
     * @FIXME use Symfony Guzzle package.
471
     * @param Page $page
472
     * @param string|DateTime $start In the format YYYYMMDD
473
     * @param string|DateTime $end In the format YYYYMMDD
474
     * @return string[]
475
     */
476
    public function getPageviews(Page $page, $start, $end)
477
    {
478
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
479
        $client = new GuzzleHttp\Client();
480
481
        if ($start instanceof DateTime) {
0 ignored issues
show
Bug introduced by
The class Xtools\DateTime does not exist. Did you forget a USE statement, or did you not list all dependencies?

This error could be the result of:

1. Missing dependencies

PHP Analyzer uses your composer.json file (if available) to determine the dependencies of your project and to determine all the available classes and functions. It expects the composer.json to be in the root folder of your repository.

Are you sure this class is defined by one of your dependencies, or did you maybe not list a dependency in either the require or require-dev section?

2. Missing use statement

PHP does not complain about undefined classes in ìnstanceof checks. For example, the following PHP code will work perfectly fine:

if ($x instanceof DoesNotExist) {
    // Do something.
}

If you have not tested against this specific condition, such errors might go unnoticed.

Loading history...
482
            $start = $start->format('YYYYMMDD');
483
        }
484
        if ($end instanceof DateTime) {
0 ignored issues
show
Bug introduced by
The class Xtools\DateTime does not exist. Did you forget a USE statement, or did you not list all dependencies?

This error could be the result of:

1. Missing dependencies

PHP Analyzer uses your composer.json file (if available) to determine the dependencies of your project and to determine all the available classes and functions. It expects the composer.json to be in the root folder of your repository.

Are you sure this class is defined by one of your dependencies, or did you maybe not list a dependency in either the require or require-dev section?

2. Missing use statement

PHP does not complain about undefined classes in ìnstanceof checks. For example, the following PHP code will work perfectly fine:

if ($x instanceof DoesNotExist) {
    // Do something.
}

If you have not tested against this specific condition, such errors might go unnoticed.

Loading history...
485
            $end = $end->format('YYYYMMDD');
486
        }
487
488
        $project = $page->getProject()->getDomain();
489
490
        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
491
            "$project/all-access/user/$title/daily/$start/$end";
492
493
        $res = $client->request('GET', $url);
494
        return json_decode($res->getBody()->getContents(), true);
495
    }
496
}
497