Passed
Push — master ( 2f87b4...a0f968 )
by MusikAnimal
05:26
created

PageRepository::getAssessments()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 23
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 14
nc 3
nop 2
dl 0
loc 23
rs 9.0856
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateTime;
9
use Mediawiki\Api\SimpleRequest;
10
use GuzzleHttp;
11
12
/**
13
 * A PageRepository fetches data about Pages, either one at a time or in batches.
14
 * Despite the name, this does not have a direct correlation with the Pages tool.
15
 * @codeCoverageIgnore
16
 */
17
class PageRepository extends Repository
18
{
19
20
    /**
     * Fetch API metadata for one page.
     *
     * Delegates to getPagesInfo() with a single-element title list and hands back
     * the first entry of the result (null when nothing was returned).
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[] Array with some of the following keys: pageid, title, missing, displaytitle,
     * url.
     */
    public function getPageInfo(Project $project, $pageTitle)
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);
        $firstPage = array_shift($pages);

        return $firstPage;
    }
32
33
    /**
     * Fetch API metadata for several pages in a single 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as reported by the API, each element
     * with some of the following keys: pageid, title, missing, displaytitle, url.
     */
    public function getPagesInfo(Project $project, $pageTitles)
    {
        // @TODO: Also include 'extlinks' prop (with 'ellimit' => 20 and 'elexpandurl' => '')
        // when we start checking for dead external links.
        // FIXME: allow page IDs via a 'pageids' parameter.
        $inprop = 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle';
        $request = new SimpleRequest('query', [
            'prop' => 'info|pageprops',
            'inprop' => $inprop,
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);

        $response = $this->getMediawikiApi($project)->getRequest($request);

        // Nothing usable came back: report an empty set.
        if (!isset($response['query']['pages'])) {
            return [];
        }

        // Re-key the page list by title.
        $pagesByTitle = [];
        foreach ($response['query']['pages'] as $page) {
            $pagesByTitle[$page['title']] = $page;
        }

        return $pagesByTitle;
    }
65
66
    /**
     * Fetch the wikitext of the revision the API returns for each of the given pages.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as reported by the API, with the page
     * text as the values. Pages without revision content map to an empty string.
     */
    public function getPagesWikitext(Project $project, $pageTitles)
    {
        $request = new SimpleRequest('query', [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);

        $response = $this->getMediawikiApi($project)->getRequest($request);

        if (!isset($response['query']['pages'])) {
            return [];
        }

        $wikitextByTitle = [];
        foreach ($response['query']['pages'] as $pageData) {
            $wikitextByTitle[$pageData['title']] = isset($pageData['revisions'][0]['content'])
                ? $pageData['revisions'][0]['content']
                : '';
        }

        return $wikitextByTitle;
    }
99
100
    /**
     * Get all revisions of a single page, served from the cache when available.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start Passed through to getRevisionsStmt().
     * @param false|int $end Passed through to getRevisionsStmt().
     * @return string[] Every row produced by getRevisionsStmt(); each member includes
     * the keys id, timestamp and length, among others.
     */
    public function getRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        // The cache key is derived from the arguments actually passed in.
        $key = $this->getCacheKey(func_get_args(), 'page_revisions');

        if ($this->cache->hasItem($key)) {
            $cached = $this->cache->getItem($key);
            return $cached->get();
        }

        // Time the database work only, not the cache lookup.
        $this->stopwatch->start($key, 'XTools');
        $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)->fetchAll();
        $this->stopwatch->stop($key);

        // Cache and return.
        return $this->setCache($key, $revisions);
    }
124
125
    /**
     * Get the statement for the revisions of a single page, so that you can iterate row by row.
     *
     * Rows are ordered oldest-first. Each row has the columns id, timestamp, minor, length,
     * length_change, user_id, username, comment and sha.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int|null $limit Max number of revisions to process. Only applied when
     *   $numRevisions is supplied as well; otherwise every matching revision is returned.
     * @param int|null $numRevisions Total number of revisions. This is used solely to determine
     *   the OFFSET needed to select the most recent $limit revisions (see below).
     * @param false|int $start Passed to getDateConditions() to restrict the date range.
     * @param false|int $end Passed to getDateConditions() to restrict the date range.
     * @return \Doctrine\DBAL\Driver\PDOStatement
     */
    public function getRevisionsStmt(
        Page $page,
        User $user = null,
        $limit = null,
        $numRevisions = null,
        $start = false,
        $end = false
    ) {
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "revs.rev_user_text = :username AND " : "";

        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
        // revision and work its way forward for proper processing. Consequently, if we want to do
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
        // supply as the OFFSET.
        $limitClause = '';
        $limit = intval($limit);
        if ($limit > 0 && isset($numRevisions)) {
            // Clamp at zero: when there are fewer revisions than $limit the raw difference is
            // negative, which is not a valid OFFSET. Both values are forced to integers
            // because they are interpolated directly into the SQL.
            $offset = max(0, intval($numRevisions) - $limit);
            $limitClause = "LIMIT $offset, $limit";
        }

        $datesConditions = $this->getDateConditions($start, $end, 'revs.');

        $sql = "SELECT
                    revs.rev_id AS id,
                    revs.rev_timestamp AS timestamp,
                    revs.rev_minor_edit AS minor,
                    revs.rev_len AS length,
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                    revs.rev_user AS user_id,
                    revs.rev_user_text AS username,
                    revs.rev_comment AS comment,
                    revs.rev_sha1 AS sha
                FROM $revTable AS revs
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                WHERE $userClause revs.rev_page = :pageid $datesConditions
                ORDER BY revs.rev_timestamp ASC
                $limitClause";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        return $this->executeProjectsQuery($sql, $params);
    }
183
184
    /**
     * Get a count of the number of revisions of a single page.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start Passed to getDateConditions() to restrict the date range.
     * @param false|int $end Passed to getDateConditions() to restrict the date range.
     * @return int
     */
    public function getNumRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            // Cast here too, so entries cached before the count was normalised
            // still come back as an integer.
            return (int) $this->cache->getItem($cacheKey)->get();
        }

        $revTable = $page->getProject()->getTableName('revision');
        $userClause = $user ? "rev_user_text = :username AND " : "";

        $datesConditions = $this->getDateConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $datesConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        // The fetched column is cast so the documented int return type holds.
        $result = (int) $this->executeProjectsQuery($sql, $params)->fetchColumn(0);

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
217
218
    /**
     * Get various basic info used in the API, in a single query for better performance.
     *
     * The result row carries: num_edits, num_editors, author, created_at, created_rev_id,
     * modified_at, modified_rev_id and author_editcount.
     *
     * Caching is only applied if the query took considerable time to process. The gadget
     * hits this for a different page constantly, so the likelihood of the cache
     * benefiting us is otherwise slim.
     * @param Page $page The page.
     * @return string[]
     */
    public function getBasicEditingInfo(Page $page)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_basicinfo');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        $dbName = $page->getProject()->getDatabaseName();
        $revTable = $this->getTableName($dbName, 'revision');
        $userTable = $this->getTableName($dbName, 'user');
        $pageTable = $this->getTableName($dbName, 'page');

        $sql = "SELECT *, (
                   SELECT user_editcount
                   FROM $userTable
                   WHERE user_name = author
                ) AS author_editcount
                FROM (
                    (
                        SELECT COUNT(*) AS num_edits,
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) a,
                    (
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
                        #   and the lowest rev_id may not be the first revision.
                        SELECT rev_user_text AS author,
                               rev_timestamp AS created_at,
                               rev_id AS created_rev_id
                        FROM $revTable
                        WHERE rev_page = :pageid
                        ORDER BY rev_timestamp ASC
                        LIMIT 1
                    ) b,
                    (
                        SELECT rev_timestamp AS modified_at,
                               rev_id AS modified_rev_id
                        FROM $revTable
                        JOIN $pageTable ON page_id = rev_page
                        WHERE rev_page = :pageid
                        AND rev_id = page_latest
                    ) c
                );";

        // Note when the query starts so that we can decide afterwards
        // whether or not the result is worth caching.
        $startedAt = time();

        /**
         * This query can sometimes take too long to run for pages with tens of thousands
         * of revisions. This query is used by the ArticleInfo gadget, which shows basic
         * data in real-time, so if it takes too long then the user probably didn't even
         * wait to see the result. We'll pass 60 as the last parameter to executeProjectsQuery,
         * which will set the max_statement_time to 60 seconds.
         */
        $statement = $this->executeProjectsQuery($sql, ['pageid' => $page->getId()], 60);
        $info = $statement->fetch();

        $elapsedSeconds = time() - $startedAt;

        // If it took over 5 seconds, cache the result for 20 minutes.
        if ($elapsedSeconds > 5) {
            $this->setCache($cacheKey, $info, 'PT20M');
        }

        return $info;
    }
296
297
    /**
     * Get any CheckWiki errors recorded for a single page.
     *
     * Only pages in namespace 0 are looked up, and only when isLabs() reports true;
     * in every other case an empty array is returned without querying.
     * @param Page $page
     * @return array Results from query
     */
    public function getCheckWikiErrors(Page $page)
    {
        // Only support mainspace...
        if ($page->getNamespace() !== 0) {
            return [];
        }
        // ...on Labs installations.
        if (!$this->isLabs()) {
            return [];
        }

        // remove _p if present
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // Page title without underscores (str_replace just to be sure)
        $pageTitle = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $statement = $this->getToolsConnection()->prepare($sql);
        $statement->bindParam(':dbName', $dbName);
        $statement->bindParam(':title', $pageTitle);
        $statement->execute();

        return $statement->fetchAll();
    }
331
332
    /**
     * Get basic wikidata on the page: label and description, in the project's language.
     * @param Page $page
     * @return string[] Empty when the page has no Wikidata ID, otherwise in the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     */
    public function getWikidataInfo(Page $page)
    {
        $entityId = $page->getWikidataId();
        if (empty($entityId)) {
            return [];
        }

        $params = [
            'lang' => $page->getProject()->getLang(),
            // The query matches on the ID with its leading 'Q' stripped.
            'wikidataId' => ltrim($entityId, 'Q'),
        ];

        $sql = "SELECT term_type AS term, term_text
                FROM wikidatawiki_p.wb_terms
                WHERE term_entity_id = :wikidataId
                AND term_type IN ('label', 'description')
                AND term_language = :lang";

        $statement = $this->executeProjectsQuery($sql, $params);

        return $statement->fetchAll();
    }
361
362
    /**
     * Get or count all wikidata items for the given page,
     *     not just languages of sister projects.
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      A page without a Wikidata ID yields [] (or 0 when counting).
     */
    public function getWikidataItems(Page $page, $count = false)
    {
        $entityId = $page->getWikidataId();
        if (!$entityId) {
            if ($count) {
                return 0;
            }
            return [];
        }

        $columns = $count ? 'COUNT(*) AS count' : '*';
        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        // The query matches on the ID with its leading 'Q' stripped.
        $params = ['wikidataId' => ltrim($entityId, 'Q')];
        $rows = $this->executeProjectsQuery($sql, $params)->fetchAll();

        if ($count) {
            return (int) $rows[0]['count'];
        }

        return $rows;
    }
388
389
    /**
     * Get number of in and outgoing links and redirects to the given page.
     *
     * All four counts are gathered with one UNION query; each result row holds a
     * 'value' and a 'type' label, which becomes the array key once '_count' is appended.
     * @param Page $page
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
     *                  'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page)
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->executeProjectsQuery($sql, [
            'id' => $page->getId(),
            // Spaces in the title are replaced with underscores for the lookup.
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Transform to associative array by 'type'
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = $row['value'];
        }

        return $counts;
    }
429
430
    /**
     * Count wikidata items for the given page, not just languages of sister projects.
     *
     * Shorthand for getWikidataItems() with its $count flag switched on.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page)
    {
        $onlyCount = true;

        return $this->getWikidataItems($page, $onlyCount);
    }
439
440
    /**
     * Get daily page views for the given page and timeframe from the Wikimedia REST API.
     * @FIXME use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start A DateTime, or a date string such as YYYYMMDD.
     * @param string|DateTime $end A DateTime, or a date string such as YYYYMMDD.
     * @return string[] The decoded JSON response body.
     */
    public function getPageviews(Page $page, $start, $end)
    {
        // Underscored, URL-encoded title for use as a path segment.
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));

        // Normalise both boundaries to YYYYMMDD, whatever form they arrived in.
        if (!($start instanceof DateTime)) {
            $start = new DateTime($start);
        }
        $start = $start->format('Ymd');

        if (!($end instanceof DateTime)) {
            $end = new DateTime($end);
        }
        $end = $end->format('Ymd');

        $domain = $page->getProject()->getDomain();

        $endpoint = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/';
        $url = $endpoint . "$domain/all-access/user/$title/daily/$start/$end";

        $client = new GuzzleHttp\Client();
        $response = $client->request('GET', $url);
        $body = $response->getBody()->getContents();

        return json_decode($body, true);
    }
472
473
    /**
     * Get the full HTML content of the page by requesting its URL.
     * @param  Page $page
     * @param  int $revId What revision to query for. When given, it is appended
     *   to the page URL as an 'oldid' query parameter.
     * @return string
     */
    public function getHTMLContent(Page $page, $revId = null)
    {
        $suffix = $revId !== null ? "?oldid=$revId" : '';
        $url = $page->getUrl() . $suffix;

        $client = new GuzzleHttp\Client();
        $response = $client->request('GET', $url);

        return $response->getBody()->getContents();
    }
490
491
    /**
     * Get the ID of the revision of a page at the time of the given DateTime.
     *
     * This is the highest rev_id of the page whose timestamp is not later than $date.
     * @param  Page     $page
     * @param  DateTime $date
     * @return int The revision ID, or 0 when the page has no revision at or before $date.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date)
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // Values are bound as parameters rather than interpolated into the SQL,
        // in line with the other queries in this class. No LIMIT is needed because
        // an aggregate without GROUP BY always yields exactly one row.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :timestamp
                AND rev_page = :pageid";
        $params = [
            'timestamp' => $date->format('YmdHis'),
            'pageid' => $page->getId(),
        ];

        // MAX() is NULL when no row matches; the cast turns that into 0.
        return (int) $this->executeProjectsQuery($sql, $params)->fetchColumn(0);
    }
509
510
    /**
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
     * This will send ceil(t/50) API requests where t is the number of titles supplied.
     * @param Project $project The project.
     * @param string[] $pageTitles The titles to fetch.
     * @return string[] Keys are the original supplied title, and values are the display titles.
     *   Titles for which the API returned no page data are omitted.
     */
    public static function displayTitles(Project $project, $pageTitles)
    {
        $api = $project->getApi();
        $displayTitles = [];

        // Titles are requested in batches of 50.
        foreach (array_chunk($pageTitles, 50) as $titleSlice) {
            $params = [
                'prop' => 'info|pageprops',
                'inprop' => 'displaytitle',
                'titles' => join('|', $titleSlice),
            ];
            $query = new SimpleRequest('query', $params);
            $result = $api->postRequest($query);

            // A response without page data (e.g. an API error) contributes nothing.
            if (!isset($result['query']['pages'])) {
                continue;
            }

            // Extract normalization info, mapping each normalized title back to
            // the title as it was originally supplied.
            $normalized = [];
            if (isset($result['query']['normalized'])) {
                foreach ($result['query']['normalized'] as $mapping) {
                    $normalized[$mapping['to']] = $mapping['from'];
                }
            }

            // Match up the normalized titles with the display titles and the original titles.
            foreach ($result['query']['pages'] as $pageInfo) {
                $displayTitle = isset($pageInfo['pageprops']['displaytitle'])
                    ? $pageInfo['pageprops']['displaytitle']
                    : $pageInfo['title'];
                $origTitle = isset($normalized[$pageInfo['title']])
                    ? $normalized[$pageInfo['title']]
                    : $pageInfo['title'];
                $displayTitles[$origTitle] = $displayTitle;
            }
        }

        return $displayTitles;
    }
558
}
559