Passed
Push — master ( 9b6f1b...510c7f )
by MusikAnimal
04:53
created

PageRepository::countWikidataItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateTime;
9
use Mediawiki\Api\SimpleRequest;
10
use GuzzleHttp;
11
12
/**
13
 * A PageRepository fetches data about Pages, either singularly or for multiple.
14
 * Despite the name, this does not have a direct correlation with the Pages tool.
15
 * @codeCoverageIgnore
16
 */
17
class PageRepository extends Repository
18
{
19
20
    /**
     * Fetch API metadata for one page.
     *
     * This is a convenience wrapper around getPagesInfo(): it asks for a single title and
     * hands back the first (and only) entry of the result.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[]|null Array with some of the following keys: pageid, title, missing,
     * displaytitle, url. Null when getPagesInfo() returned nothing.
     */
    public function getPageInfo(Project $project, $pageTitle)
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);
        $firstPage = array_shift($pages);
        return $firstPage;
    }
32
33
    /**
     * Fetch API metadata for several pages in one request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as reported by the API, each element
     * with some of the following keys: pageid, title, missing, displaytitle, url. Empty when
     * the response carries no 'pages' list.
     */
    public function getPagesInfo(Project $project, $pageTitles)
    {
        // @TODO: Also include the 'extlinks' prop (plus 'ellimit' and 'elexpandurl') when we
        // start checking for dead external links.
        // FIXME: allow querying by page IDs ('pageids') in addition to titles.
        $request = new SimpleRequest('query', [
            'prop' => 'info|pageprops',
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle',
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        $pagesByTitle = [];
        if (!isset($response['query']['pages'])) {
            return $pagesByTitle;
        }

        // Re-key the flat list of pages by title so callers can look pages up directly.
        foreach ($response['query']['pages'] as $pageInfo) {
            $pagesByTitle[$pageInfo['title']] = $pageInfo;
        }
        return $pagesByTitle;
    }
65
66
    /**
     * Fetch the wikitext of a set of pages through the API.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as reported by the API, with the page
     * text as the values. Pages without revision content map to an empty string.
     */
    public function getPagesWikitext(Project $project, $pageTitles)
    {
        $request = new SimpleRequest('query', [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        // No 'pages' in the response means there is nothing to report.
        $pages = isset($response['query']['pages']) ? $response['query']['pages'] : [];

        $wikitext = [];
        foreach ($pages as $pageData) {
            $hasContent = isset($pageData['revisions'][0]['content']);
            $wikitext[$pageData['title']] = $hasContent ? $pageData['revisions'][0]['content'] : '';
        }

        return $wikitext;
    }
99
100
    /**
     * Get all revisions of a single page, oldest first, using the cache when possible.
     *
     * The rows come from getRevisionsStmt() called without a limit.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return string[] Each member with keys: id, timestamp, minor, length, length_change,
     * user_id, username, comment, sha.
     */
    public function getRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $key = $this->getCacheKey(func_get_args(), 'page_revisions');
        $isCached = $this->cache->hasItem($key);
        if ($isCached) {
            return $this->cache->getItem($key)->get();
        }

        // Time the database work under the same name as the cache key.
        $this->stopwatch->start($key, 'XTools');
        $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)->fetchAll();
        $this->stopwatch->stop($key);

        // Cache and return.
        return $this->setCache($key, $revisions);
    }
124
125
    /**
     * Get the statement for the revisions of a single page, so that you can iterate row by row.
     *
     * Rows are sorted ascending by rev_timestamp. Each row has the keys: id, timestamp, minor,
     * length, length_change, user_id, username, comment, sha.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int|null $limit Max number of revisions to process. Only applied when
     *   $numRevisions is also given; otherwise all revisions are returned.
     * @param int|null $numRevisions Number of revisions, if known. This is used solely to
     *   determine the OFFSET for $limit, so that the most recent $limit revisions are returned.
     *   If it is smaller than $limit, the offset is clamped to zero.
     * @param false|int $start
     * @param false|int $end
     * @return \Doctrine\DBAL\Driver\PDOStatement
     */
    public function getRevisionsStmt(
        Page $page,
        User $user = null,
        $limit = null,
        $numRevisions = null,
        $start = false,
        $end = false
    ) {
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "revs.rev_user_text = :username AND " : "";

        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
        // revision and work its way forward for proper processing. Consequently, if we want to do
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
        // supply as the OFFSET.
        $limitClause = '';
        $limit = intval($limit);
        if ($limit > 0 && isset($numRevisions)) {
            // Both values are interpolated into the SQL, so force them to integers, and never
            // let the offset go negative (which would be a syntax error) when the page has
            // fewer revisions than the limit.
            $offset = max(0, intval($numRevisions) - $limit);
            $limitClause = "LIMIT $offset, $limit";
        }

        $datesConditions = $this->getDateConditions($start, $end, 'revs.');

        $sql = "SELECT
                    revs.rev_id AS id,
                    revs.rev_timestamp AS timestamp,
                    revs.rev_minor_edit AS minor,
                    revs.rev_len AS length,
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                    revs.rev_user AS user_id,
                    revs.rev_user_text AS username,
                    revs.rev_comment AS comment,
                    revs.rev_sha1 AS sha
                FROM $revTable AS revs
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                WHERE $userClause revs.rev_page = :pageid $datesConditions
                ORDER BY revs.rev_timestamp ASC
                $limitClause";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        return $this->executeProjectsQuery($sql, $params);
    }
183
184
    /**
     * Get a count of the number of revisions of a single page.
     *
     * The result is cached. The count is cast to an integer on the way out, because the
     * database driver hands back the COUNT(*) column as a string.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return int
     */
    public function getNumRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            // Cast here too, so entries cached before the cast was introduced stay consistent.
            return (int) $this->cache->getItem($cacheKey)->get();
        }

        $revTable = $page->getProject()->getTableName('revision');
        $userClause = $user ? "rev_user_text = :username AND " : "";

        $datesConditions = $this->getDateConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $datesConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        $result = (int) $this->executeProjectsQuery($sql, $params)->fetchColumn(0);

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
217
218
    /**
     * Get various basic info used in the API: the number of revisions, the number of unique
     *   editors, the initial author with their edit count, the creation and last-modified
     *   timestamps, and the first and latest revision IDs.
     * Everything is fetched with a single query for better performance.
     * The result is only cached when the query was slow, because the gadget calls this for a
     *   different page almost every time, so caching rarely pays off.
     * @param Page $page The page.
     * @return string[] Row with the keys num_edits, num_editors, author, created_at,
     *   created_rev_id, modified_at, modified_rev_id and author_editcount.
     */
    public function getBasicEditingInfo(Page $page)
    {
        $key = $this->getCacheKey(func_get_args(), 'page_basicinfo');
        if ($this->cache->hasItem($key)) {
            return $this->cache->getItem($key)->get();
        }

        /**
         * This query can sometimes take too long to run for pages with tens of thousands
         * of revisions. It is used by the ArticleInfo gadget, which shows basic data in
         * real-time, so if it takes too long then the user probably didn't even wait to
         * see the result. We use the max_statement_time variable to cap the query time
         * at 60 seconds.
         */
        $this->executeProjectsQuery("SET max_statement_time = 60;");

        $dbName = $page->getProject()->getDatabaseName();
        $revTable = $this->getTableName($dbName, 'revision');
        $userTable = $this->getTableName($dbName, 'user');
        $pageTable = $this->getTableName($dbName, 'page');

        $sql = "SELECT *, (
                   SELECT user_editcount
                   FROM $userTable
                   WHERE user_name = author
                ) AS author_editcount
                FROM (
                    (
                        SELECT COUNT(*) AS num_edits,
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) a,
                    (
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
                        #   and the lowest rev_id may not be the first revision.
                        SELECT rev_user_text AS author,
                               rev_timestamp AS created_at,
                               rev_id AS created_rev_id
                        FROM $revTable
                        WHERE rev_page = :pageid
                        ORDER BY rev_timestamp ASC
                        LIMIT 1
                    ) b,
                    (
                        SELECT MAX(rev_timestamp) AS modified_at
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) c,
                    (
                        SELECT page_latest AS modified_rev_id
                        FROM $pageTable
                        WHERE page_id = :pageid
                    ) d
                );";

        // Measure how long the query takes (whole seconds) so we can decide
        // whether or not to cache the result.
        $startedAt = time();
        $info = $this->executeProjectsQuery($sql, ['pageid' => $page->getId()])->fetch();
        $elapsed = time() - $startedAt;

        // If it took over 5 seconds, cache the result for 20 minutes.
        if ($elapsed > 5) {
            $this->setCache($key, $info, 'PT20M');
        }

        return $info;
    }
299
300
    /**
     * Get assessment data for the given pages.
     *
     * Results are cached. An empty array is returned, without querying, when the project has
     * no page assessments or when no page IDs are given.
     * @param Project $project The project to which the pages belong.
     * @param int[] $pageIds Page IDs.
     * @return string[] Assessment data as retrieved from the database, each row with the keys
     *   wikiproject, class and importance.
     */
    public function getAssessments(Project $project, $pageIds)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_assessments');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        if (!$project->hasPageAssessments()) {
            return [];
        }

        // The IDs are interpolated into the SQL below, so force each one to an integer.
        $ids = array_map('intval', $pageIds);
        if (count($ids) === 0) {
            // "IN ()" is a syntax error, and no pages means no assessments anyway.
            return [];
        }

        $paTable = $this->getTableName($project->getDatabaseName(), 'page_assessments');
        $papTable = $this->getTableName($project->getDatabaseName(), 'page_assessments_projects');

        // Separator first: the reversed argument order is no longer supported as of PHP 8.0.
        $idList = implode(',', $ids);

        $sql = "SELECT pap_project_title AS wikiproject, pa_class AS class, pa_importance AS importance
                FROM $paTable
                LEFT JOIN $papTable ON pa_project_id = pap_project_id
                WHERE pa_page_id IN ($idList)";

        $result = $this->executeProjectsQuery($sql)->fetchAll();

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
330
331
    /**
     * Look up the open CheckWiki errors recorded for a single page.
     *
     * Only mainspace pages on Labs installations are supported; anything else yields an
     * empty array without touching the database.
     * @param Page $page
     * @return array Rows with the keys error, notice, found, name, prio and explanation.
     */
    public function getCheckWikiErrors(Page $page)
    {
        // Not in the main namespace: unsupported.
        if ($page->getNamespace() !== 0) {
            return [];
        }
        // Not running on Labs: unsupported.
        if (!$this->isLabs()) {
            return [];
        }

        // The project is looked up by its database name with any trailing "_p" removed.
        $project = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // Titles are matched with spaces rather than underscores (str_replace just to be sure).
        $title = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $statement = $this->getToolsConnection()->prepare($sql);
        $statement->bindParam(':dbName', $project);
        $statement->bindParam(':title', $title);
        $statement->execute();

        return $statement->fetchAll();
    }
365
366
    /**
     * Get the Wikidata label and description of the page, in the language of its project.
     * @param Page $page
     * @return string[] Empty when the page has no Wikidata ID, otherwise in the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     */
    public function getWikidataInfo(Page $page)
    {
        $itemId = $page->getWikidataId();
        if (empty($itemId)) {
            return [];
        }

        $sql = "SELECT term_type AS term, term_text
                FROM wikidatawiki_p.wb_terms
                WHERE term_entity_id = :wikidataId
                AND term_type IN ('label', 'description')
                AND term_language = :lang";

        $params = [
            'lang' => $page->getProject()->getLang(),
            // The query takes the ID without its leading "Q".
            'wikidataId' => ltrim($itemId, 'Q'),
        ];

        $statement = $this->executeProjectsQuery($sql, $params);
        return $statement->fetchAll();
    }
395
396
    /**
     * Get or count all Wikidata site links recorded for the item of the given page,
     *     not just languages of sister projects.
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      For a page without a Wikidata ID this is [] or 0 respectively.
     */
    public function getWikidataItems(Page $page, $count = false)
    {
        $itemId = $page->getWikidataId();
        if (!$itemId) {
            if ($count) {
                return 0;
            }
            return [];
        }

        $columns = $count ? 'COUNT(*) AS count' : '*';
        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        // The query takes the ID without its leading "Q".
        $rows = $this->executeProjectsQuery($sql, [
            'wikidataId' => ltrim($itemId, 'Q'),
        ])->fetchAll();

        if ($count) {
            return (int) $rows[0]['count'];
        }
        return $rows;
    }
422
423
    /**
     * Count the external links, outgoing links, incoming links and redirects of a page.
     *
     * All four counts are collected with one UNION query; each row carries a count and a
     * type label, which is turned into the "<type>_count" key of the result.
     * @param Page $page
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
     *                  'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page)
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->executeProjectsQuery($sql, [
            'id' => $page->getId(),
            // Titles are matched with underscores rather than spaces.
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Transform to associative array by 'type'
        $counts = [];
        foreach ($rows as $row) {
            $counts["{$row['type']}_count"] = $row['value'];
        }

        return $counts;
    }
463
464
    /**
     * Count the Wikidata items for the given page, not just languages of sister projects.
     *
     * Shorthand for getWikidataItems() in counting mode.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page)
    {
        $numItems = $this->getWikidataItems($page, true);
        return $numItems;
    }
473
474
    /**
     * Get the daily page views of the given page within a timeframe, as reported by the
     * Wikimedia REST API (all access methods, user traffic only).
     * @FIXME use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start In the format YYYYMMDD
     * @param string|DateTime $end In the format YYYYMMDD
     * @return string[] The decoded JSON response.
     */
    public function getPageviews(Page $page, $start, $end)
    {
        // The title goes into the URL path with underscores, percent-encoded.
        $encodedTitle = rawurlencode(str_replace(' ', '_', $page->getTitle()));
        $client = new GuzzleHttp\Client();

        // Accept both DateTime objects and date strings; either way we need YYYYMMDD.
        $startDate = $start instanceof DateTime ? $start : new DateTime($start);
        $from = $startDate->format('Ymd');
        $endDate = $end instanceof DateTime ? $end : new DateTime($end);
        $to = $endDate->format('Ymd');

        $domain = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
            "$domain/all-access/user/$encodedTitle/daily/$from/$to";

        $response = $client->request('GET', $url);
        $json = $response->getBody()->getContents();
        return json_decode($json, true);
    }
506
507
    /**
     * Download the full HTML of the page by requesting its URL.
     * @param  Page $page
     * @param  int $revId What revision to query for. Leave null to request the page URL as is.
     * @return string
     */
    public function getHTMLContent(Page $page, $revId = null)
    {
        $url = $page->getUrl();
        if (null !== $revId) {
            // Ask for a specific revision instead of the current one.
            $url = $url . "?oldid=$revId";
        }

        $response = (new GuzzleHttp\Client())->request('GET', $url);
        return $response->getBody()->getContents();
    }
524
525
    /**
     * Get the ID of the revision of a page at the time of the given DateTime.
     *
     * This is the highest revision ID of the page whose timestamp is not later than $date.
     * The timestamp and page ID are bound as query parameters rather than interpolated into
     * the SQL, in line with the other queries of this class.
     * @param  Page     $page
     * @param  DateTime $date
     * @return int The revision ID, or 0 if the page has no revision at or before $date.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date)
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // MAX() without GROUP BY always yields exactly one row, so no LIMIT is needed.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :datestamp
                AND rev_page = :pageid";

        $resultQuery = $this->executeProjectsQuery($sql, [
            // Revision timestamps are compared in the YYYYMMDDHHMMSS format.
            'datestamp' => $date->format('YmdHis'),
            'pageid' => $page->getId(),
        ]);

        // A NULL maximum (no matching revision) is cast to 0.
        return (int) $resultQuery->fetchColumn();
    }
543
}
544