Passed
Push — master ( f43d54...b6518a )
by MusikAnimal
01:39
created

PageRepository::countWikidataItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateTime;
9
use DateInterval;
10
use Mediawiki\Api\SimpleRequest;
11
use GuzzleHttp;
12
13
/**
14
 * A PageRepository fetches data about Pages, either one at a time or several at once.
15
 * Despite the name, this does not have a direct correlation with the Pages tool.
16
 * @codeCoverageIgnore
17
 */
18
class PageRepository extends Repository
19
{
20
21
    /**
     * Fetch API metadata for one page.
     *
     * Delegates to getPagesInfo() with a one-element title list and hands back the first entry.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[] Array with some of the following keys: pageid, title, missing, displaytitle,
     * url. NOTE(review): array_shift() gives null if the lookup returned nothing.
     */
    public function getPageInfo(Project $project, $pageTitle)
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);
        $firstPage = array_shift($pages);

        return $firstPage;
    }
33
34
    /**
     * Fetch API metadata for several pages in a single 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as returned by the API, each element with
     * some of the following keys: pageid, title, missing, displaytitle, url. Empty when the
     * response carries no 'pages' section.
     */
    public function getPagesInfo(Project $project, $pageTitles)
    {
        // @TODO: Also request the 'extlinks' prop (with 'ellimit' => 20 and 'elexpandurl' => '')
        // when we start checking for dead external links.
        // FIXME: allow looking pages up by page ID ('pageids') as well.
        $request = new SimpleRequest('query', [
            'prop' => 'info|pageprops',
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle',
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        if (!isset($response['query']['pages'])) {
            return [];
        }

        // Re-key the list of pages by title.
        $infoByTitle = [];
        foreach ($response['query']['pages'] as $pageInfo) {
            $infoByTitle[$pageInfo['title']] = $pageInfo;
        }

        return $infoByTitle;
    }
66
67
    /**
     * Fetch the wikitext of several pages in a single 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as returned by the API, with the page text
     * as the values. A page without revision content maps to an empty string.
     */
    public function getPagesWikitext(Project $project, $pageTitles)
    {
        $params = [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ];
        $request = new SimpleRequest('query', $params);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        $wikitexts = [];
        if (isset($response['query']['pages'])) {
            foreach ($response['query']['pages'] as $page) {
                $hasContent = isset($page['revisions'][0]['content']);
                $wikitexts[$page['title']] = $hasContent ? $page['revisions'][0]['content'] : '';
            }
        }

        return $wikitexts;
    }
100
101
    /**
     * Get all revisions of a single page, cached for 10 minutes.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return string[] Each member with keys: id, timestamp, minor, length, length_change,
     *   user_id, username, comment, sha.
     */
    public function getRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');
        $isCached = $this->cache->hasItem($cacheKey);
        if ($isCached) {
            return $this->cache->getItem($cacheKey)->get();
        }

        // The cache key doubles as the stopwatch event name.
        $this->stopwatch->start($cacheKey, 'XTools');

        // No limit and no known revision count: fetch everything in the date range.
        $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)->fetchAll();

        // Cache for 10 minutes, and return.
        $ttl = new DateInterval('PT10M');
        $cacheItem = $this->cache->getItem($cacheKey)
            ->set($revisions)
            ->expiresAfter($ttl);
        $this->cache->save($cacheItem);

        $this->stopwatch->stop($cacheKey);

        return $revisions;
    }
130
131
    /**
     * Get the statement for the revisions of a single page, so that you can iterate row by row.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int|null $limit Max number of revisions to process.
     * @param int|null $numRevisions Number of revisions, if known. This is used solely to determine
     *   the OFFSET if we are given a $limit (see below). When $numRevisions is not given, no LIMIT
     *   clause is added, whatever the value of $limit.
     * @param false|int $start
     * @param false|int $end
     * @return \Doctrine\DBAL\Driver\PDOStatement
     */
    public function getRevisionsStmt(
        Page $page,
        User $user = null,
        $limit = null,
        $numRevisions = null,
        $start = false,
        $end = false
    ) {
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "revs.rev_user_text in (:username) AND " : "";

        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
        // revision and work its way forward for proper processing. Consequently, if we want to do
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
        // supply as the OFFSET.
        $limitClause = '';
        $limit = intval($limit);
        if ($limit > 0 && isset($numRevisions)) {
            // Both numbers are interpolated into the SQL, so force them to integers. Clamp the
            // offset at zero: a page with fewer revisions than $limit would otherwise yield a
            // negative OFFSET, which is a SQL syntax error.
            $offset = max(0, intval($numRevisions) - $limit);
            $limitClause = "LIMIT $offset, $limit";
        }

        $datesConditions = $this->createDatesConditions($start, $end, 'revs.');

        $sql = "SELECT
                    revs.rev_id AS id,
                    revs.rev_timestamp AS timestamp,
                    revs.rev_minor_edit AS minor,
                    revs.rev_len AS length,
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                    revs.rev_user AS user_id,
                    revs.rev_user_text AS username,
                    revs.rev_comment AS comment,
                    revs.rev_sha1 AS sha
                FROM $revTable AS revs
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                WHERE $userClause revs.rev_page = :pageid $datesConditions
                ORDER BY revs.rev_timestamp ASC
                $limitClause";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        $conn = $this->getProjectsConnection();
        return $conn->executeQuery($sql, $params);
    }
190
191
    /**
     * Get a count of the number of revisions of a single page, cached for 10 minutes.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return int
     */
    public function getNumRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            // Cast here as well, in case the cached value was stored uncast.
            return (int)$this->cache->getItem($cacheKey)->get();
        }

        $revTable = $page->getProject()->getTableName('revision');
        $userClause = $user ? "rev_user_text in (:username) AND " : "";

        $datesConditions = $this->createDatesConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $datesConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        $conn = $this->getProjectsConnection();
        // Cast the fetched column so the method returns the int its contract promises.
        $result = (int)$conn->executeQuery($sql, $params)->fetchColumn(0);

        // Cache for 10 minutes, and return.
        $cacheItem = $this->cache->getItem($cacheKey)
            ->set($result)
            ->expiresAfter(new DateInterval('PT10M'));
        $this->cache->save($cacheItem);
        return $result;
    }
229
230
    /**
     * Get various basic info used in the API: the number of revisions, the number of unique
     *   editors, the initial author with their edit count, the creation and last-modification
     *   timestamps, and the first and latest revision IDs.
     * This is combined into one query for better performance.
     * Caching is only applied if it took considerable time to process,
     *   because using the gadget, this will get hit for a different page
     *   constantly, where the likelihood of cache benefiting us is slim.
     * @param Page $page The page.
     * @return string[] Row with the keys num_edits, num_editors, author, created_at,
     *   created_rev_id, modified_at, modified_rev_id and author_editcount.
     */
    public function getBasicEditingInfo(Page $page)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_basicinfo');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        $conn = $this->getProjectsConnection();

        /**
         * This query can sometimes take too long to run for pages with tens of thousands
         * of revisions. It is used by the ArticleInfo gadget, which shows basic data in
         * real-time, so if it takes that long then the user probably didn't even wait to
         * see the result. We utilize the max_statement_time variable to set a maximum
         * query time of 60 seconds.
         */
        $conn->executeQuery("SET max_statement_time = 60;");

        $dbName = $page->getProject()->getDatabaseName();
        $revTable = $this->getTableName($dbName, 'revision');
        $userTable = $this->getTableName($dbName, 'user');
        $pageTable = $this->getTableName($dbName, 'page');

        $sql = "SELECT *, (
                   SELECT user_editcount
                   FROM $userTable
                   WHERE user_name = author
                ) AS author_editcount
                FROM (
                    (
                        SELECT COUNT(*) AS num_edits,
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) a,
                    (
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
                        #   and the lowest rev_id may not be the first revision.
                        SELECT rev_user_text AS author,
                               rev_timestamp AS created_at,
                               rev_id AS created_rev_id
                        FROM $revTable
                        WHERE rev_page = :pageid
                        ORDER BY rev_timestamp ASC
                        LIMIT 1
                    ) b,
                    (
                        SELECT MAX(rev_timestamp) AS modified_at
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) c,
                    (
                        SELECT page_latest AS modified_rev_id
                        FROM $pageTable
                        WHERE page_id = :pageid
                    ) d
                );";

        // Time the query (whole seconds) so we can decide whether or not to cache the result.
        $startedAt = time();
        $info = $conn->executeQuery($sql, ['pageid' => $page->getId()])->fetch();
        $elapsed = time() - $startedAt;

        // If it took over 5 seconds, cache the result for 20 minutes.
        if ($elapsed > 5) {
            $cacheItem = $this->cache->getItem($cacheKey)
                ->set($info)
                ->expiresAfter(new DateInterval('PT20M'));
            $this->cache->save($cacheItem);
        }

        return $info;
    }
316
317
    /**
     * Get assessment data for the given pages, cached for 10 minutes.
     * @param Project $project The project to which the pages belong.
     * @param int[] $pageIds Page IDs
     * @return string[] Assessment data as retrieved from the database, each row with the keys
     *   wikiproject, class and importance. Empty if the project has no page assessments or no
     *   page IDs were given.
     */
    public function getAssessments(Project $project, $pageIds)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_assessments');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        if (!$project->hasPageAssessments()) {
            return [];
        }

        // The IDs are interpolated into the SQL below, so force each one to an integer.
        $pageIds = array_map('intval', $pageIds);
        if (empty($pageIds)) {
            // "IN ()" is a SQL syntax error; with no IDs there is nothing to look up.
            return [];
        }

        $paTable = $this->getTableName($project->getDatabaseName(), 'page_assessments');
        $papTable = $this->getTableName($project->getDatabaseName(), 'page_assessments_projects');

        // Separator first: the reversed (array, separator) order was removed in PHP 8.
        $pageIds = implode(',', $pageIds);

        $query = "SELECT pap_project_title AS wikiproject, pa_class AS class, pa_importance AS importance
                  FROM $paTable
                  LEFT JOIN $papTable ON pa_project_id = pap_project_id
                  WHERE pa_page_id IN ($pageIds)";

        $conn = $this->getProjectsConnection();
        $result = $conn->executeQuery($query)->fetchAll();

        // Cache for 10 minutes, and return.
        $cacheItem = $this->cache->getItem($cacheKey)
            ->set($result)
            ->expiresAfter(new DateInterval('PT10M'));
        $this->cache->save($cacheItem);
        return $result;
    }
352
353
    /**
     * Look up the open CheckWiki errors recorded for a single page.
     * @param Page $page
     * @return array Results from query; always empty for pages outside the main namespace and
     *   on non-Labs installations.
     */
    public function getCheckWikiErrors(Page $page)
    {
        // Only support mainspace on Labs installations
        $isMainspace = $page->getNamespace() === 0;
        if (!($isMainspace && $this->isLabs())) {
            return [];
        }

        // remove _p if present
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // Page title without underscores (str_replace just to be sure)
        $pageTitle = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $stmt = $this->getToolsConnection()->prepare($sql);
        $stmt->bindParam(':dbName', $dbName);
        $stmt->bindParam(':title', $pageTitle);
        $stmt->execute();

        return $stmt->fetchAll();
    }
387
388
    /**
     * Get the Wikidata label and description of the page, in the language of its project.
     * @param Page $page
     * @return string[] Empty if the page has no Wikidata ID, otherwise in the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     */
    public function getWikidataInfo(Page $page)
    {
        if (empty($page->getWikidataId())) {
            return [];
        }

        // The terms table is queried by the numeric part of the ID, so strip the leading 'Q'.
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
        $lang = $page->getProject()->getLang();

        $sql = "SELECT term_type AS term, term_text
                FROM wikidatawiki_p.wb_terms
                WHERE term_entity_id = :wikidataId
                AND term_type IN ('label', 'description')
                AND term_language = :lang";

        $stmt = $this->getProjectsConnection()->prepare($sql);
        $stmt->bindParam(':wikidataId', $wikidataId);
        $stmt->bindParam(':lang', $lang);
        $stmt->execute();

        return $stmt->fetchAll();
    }
419
420
    /**
     * Get or count all wikidata items for the given page,
     *     not just languages of sister projects
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      A page without a Wikidata ID gives [] (or 0 when counting).
     */
    public function getWikidataItems(Page $page, $count = false)
    {
        if (!$page->getWikidataId()) {
            if ($count) {
                return 0;
            }
            return [];
        }

        // The table is queried by the numeric part of the ID, so strip the leading 'Q'.
        $wikidataId = ltrim($page->getWikidataId(), 'Q');

        $columns = $count ? 'COUNT(*) AS count' : '*';
        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        $stmt = $this->getProjectsConnection()->prepare($sql);
        $stmt->bindParam(':wikidataId', $wikidataId);
        $stmt->execute();

        $rows = $stmt->fetchAll();

        if ($count) {
            return (int) $rows[0]['count'];
        }

        return $rows;
    }
448
449
    /**
     * Count the external links, outgoing links, incoming links and redirects of the given page,
     * all in one UNION query.
     * @param Page $page
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
     *                  'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page)
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        // The title is bound with underscores in place of spaces.
        $bindings = [
            'id' => $page->getId(),
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ];

        $rows = $this->getProjectsConnection()->executeQuery($sql, $bindings);

        // Transform to associative array by 'type'
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = $row['value'];
        }

        return $counts;
    }
491
492
    /**
     * Count wikidata items for the given page, not just languages of sister projects.
     *
     * Convenience wrapper around getWikidataItems() with counting switched on.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page)
    {
        $countOnly = true;

        return $this->getWikidataItems($page, $countOnly);
    }
501
502
    /**
     * Get daily page views for the given page and timeframe from the Wikimedia REST API
     * (all-access, user traffic).
     * @FIXME use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start A DateTime, or a string that DateTime can parse (such as
     *   YYYYMMDD).
     * @param string|DateTime $end A DateTime, or a string that DateTime can parse (such as
     *   YYYYMMDD).
     * @return string[] The decoded JSON response.
     */
    public function getPageviews(Page $page, $start, $end)
    {
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
        $client = new GuzzleHttp\Client();

        // Normalise both bounds to the YYYYMMDD form used in the URL.
        $startDate = $start instanceof DateTime ? $start : new DateTime($start);
        $startStamp = $startDate->format('Ymd');
        $endDate = $end instanceof DateTime ? $end : new DateTime($end);
        $endStamp = $endDate->format('Ymd');

        $domain = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
            "$domain/all-access/user/$title/daily/$startStamp/$endStamp";

        $response = $client->request('GET', $url);
        $body = $response->getBody()->getContents();

        return json_decode($body, true);
    }
534
535
    /**
     * Get the full HTML content of the page by requesting its URL.
     * @param  Page $page
     * @param  int $revId What revision to query for. When null, the page URL is requested as is.
     * @return string
     */
    public function getHTMLContent(Page $page, $revId = null)
    {
        $client = new GuzzleHttp\Client();

        $url = $page->getUrl();
        if (null !== $revId) {
            $url = $url . "?oldid=$revId";
        }

        $response = $client->request('GET', $url);

        return $response->getBody()->getContents();
    }
552
553
    /**
     * Get the ID of the revision of a page at the time of the given DateTime, i.e. the highest
     * revision ID whose timestamp is not later than $date.
     * @param  Page     $page
     * @param  DateTime $date
     * @return int The revision ID, or 0 when the query yields no revision ID.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date)
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // Values are bound rather than interpolated, consistent with the other queries in this
        // class. MAX() already yields a single row, so no LIMIT is needed.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :timestamp
                AND rev_page = :pageid";
        $params = [
            'timestamp' => $date->format('YmdHis'),
            'pageid' => $page->getId(),
        ];

        $resultQuery = $this->getProjectsConnection()->executeQuery($sql, $params);
        return (int)$resultQuery->fetchColumn();
    }
571
}
572