Passed
Push — master ( becd39...b8b862 )
by MusikAnimal
05:00
created

PageRepository   B

Complexity

Total Complexity 50

Size/Duplication

Total Lines 573
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
dl 0
loc 573
rs 8.6206
c 0
b 0
f 0
wmc 50

17 Methods

Rating   Name   Duplication   Size   Complexity  
B getPagesWikitext() 0 26 4
A getPageInfo() 0 4 1
A countWikidataItems() 0 3 1
B getPagesInfo() 0 24 3
B getNumRevisions() 0 24 4
B getRevisionsStmt() 0 45 5
A getRevisions() 0 15 2
A getPageviews() 0 23 3
A getWikidataInfo() 0 19 2
B getCheckWikiErrors() 0 28 3
A getHTMLContent() 0 10 2
B getWikidataItems() 0 17 5
B countLinksAndRedirects() 0 33 2
A getRevisionIdAtDate() 0 11 1
A getAssessments() 0 23 3
A getBasicEditingInfo() 0 68 3
B displayTitles() 0 39 6

How to fix   Complexity   

Complex Class

Complex classes like PageRepository often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use PageRepository, and based on these observations, apply Extract Interface, too.

1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
namespace Xtools;
7
8
use DateTime;
9
use Mediawiki\Api\SimpleRequest;
10
use GuzzleHttp;
11
12
/**
13
 * A PageRepository fetches data about Pages, either one at a time or for several at once.
14
 * Despite the name, this does not have a direct correlation with the Pages tool.
15
 * @codeCoverageIgnore
16
 */
17
class PageRepository extends Repository
18
{
19
20
    /**
     * Fetch API metadata for one page.
     *
     * Delegates to getPagesInfo() with a one-element title list and hands back the first
     * entry of the result.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[]|null Array with some of the following keys: pageid, title, missing,
     *   displaytitle, url. Null when getPagesInfo() returned nothing.
     */
    public function getPageInfo(Project $project, $pageTitle)
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);

        // array_shift() gives null for an empty array.
        return array_shift($pages);
    }
32
33
    /**
     * Fetch API metadata for several pages in one 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return array[] Keyed by the page title as reported by the API; each element is the
     *   API's page record, with some of the following keys: pageid, title, missing,
     *   displaytitle, url. Empty when the response contains no pages.
     */
    public function getPagesInfo(Project $project, $pageTitles)
    {
        // @TODO: Also include 'extlinks' prop (with 'ellimit' => 20 and 'elexpandurl' => '')
        // when we start checking for dead external links.
        // FIXME: allow page IDs ('pageids') to be supplied instead of titles.
        $inprop = 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle';
        $request = new SimpleRequest('query', [
            'prop' => 'info|pageprops',
            'inprop' => $inprop,
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        if (!isset($response['query']['pages'])) {
            return [];
        }

        // Re-key the page records by their title.
        $pagesByTitle = [];
        foreach ($response['query']['pages'] as $pageInfo) {
            $pagesByTitle[$pageInfo['title']] = $pageInfo;
        }

        return $pagesByTitle;
    }
65
66
    /**
     * Fetch the wikitext of several pages in one 'query' request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Keyed by the page title as reported by the API, with the page text as the
     *   value. Pages for which the response carries no revision content map to an empty string.
     */
    public function getPagesWikitext(Project $project, $pageTitles)
    {
        $request = new SimpleRequest('query', [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);
        $response = $this->getMediawikiApi($project)->getRequest($request);

        $wikitext = [];
        if (isset($response['query']['pages'])) {
            foreach ($response['query']['pages'] as $page) {
                $hasContent = isset($page['revisions'][0]['content']);
                $wikitext[$page['title']] = $hasContent ? $page['revisions'][0]['content'] : '';
            }
        }

        return $wikitext;
    }
99
100
    /**
     * Fetch all revisions of one page, going through the cache.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return string[] One row per revision, as selected by getRevisionsStmt()
     *   (id, timestamp, length among other columns).
     */
    public function getRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');

        if (!$this->cache->hasItem($cacheKey)) {
            // Cache miss: run the query, timing it under the cache key.
            $this->stopwatch->start($cacheKey, 'XTools');
            $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)->fetchAll();
            $this->stopwatch->stop($cacheKey);

            return $this->setCache($cacheKey, $revisions);
        }

        return $this->cache->getItem($cacheKey)->get();
    }
124
125
    /**
     * Get the statement for the revisions of a single page, so that you can iterate row by row.
     * Rows are ordered oldest-first and carry the columns id, timestamp, minor, length,
     * length_change, user_id, username, comment and sha.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int|null $limit Max number of revisions to process.
     * @param int|null $numRevisions Total number of revisions. This is used solely to determine
     *   the OFFSET that selects the most recent $limit revisions (see below). A LIMIT is only
     *   applied when both $limit and $numRevisions are given; this method does not run a count
     *   query of its own.
     * @param false|int $start
     * @param false|int $end
     * @return \Doctrine\DBAL\Driver\PDOStatement
     */
    public function getRevisionsStmt(
        Page $page,
        User $user = null,
        $limit = null,
        $numRevisions = null,
        $start = false,
        $end = false
    ) {
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
        $userClause = $user ? "revs.rev_user_text = :username AND " : "";

        // This sorts ascending by rev_timestamp because ArticleInfo must start with the oldest
        // revision and work its way forward for proper processing. Consequently, if we want to do
        // a LIMIT we want the most recent revisions, so we also need to know the total count to
        // supply as the OFFSET.
        $limitClause = '';
        if (intval($limit) > 0 && isset($numRevisions)) {
            // Cast before interpolating so that only integers ever reach the SQL text, and clamp
            // the offset at zero: with fewer revisions than $limit the subtraction goes negative,
            // which is not a valid OFFSET.
            $limit = intval($limit);
            $offset = max(0, intval($numRevisions) - $limit);
            $limitClause = "LIMIT $offset, $limit";
        }

        $datesConditions = $this->getDateConditions($start, $end, 'revs.');

        $sql = "SELECT
                    revs.rev_id AS id,
                    revs.rev_timestamp AS timestamp,
                    revs.rev_minor_edit AS minor,
                    revs.rev_len AS length,
                    (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                    revs.rev_user AS user_id,
                    revs.rev_user_text AS username,
                    revs.rev_comment AS comment,
                    revs.rev_sha1 AS sha
                FROM $revTable AS revs
                LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                WHERE $userClause revs.rev_page = :pageid $datesConditions
                ORDER BY revs.rev_timestamp ASC
                $limitClause";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        return $this->executeProjectsQuery($sql, $params);
    }
183
184
    /**
     * Get a count of the number of revisions of a single page.
     * The result is cached under a key derived from the arguments.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return int
     */
    public function getNumRevisions(Page $page, User $user = null, $start = false, $end = false)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            // Cast here as well, so the documented int return type also holds for cached values.
            return (int) $this->cache->getItem($cacheKey)->get();
        }

        $revTable = $page->getProject()->getTableName('revision');
        $userClause = $user ? "rev_user_text = :username AND " : "";

        $datesConditions = $this->getDateConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $datesConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['username'] = $user->getUsername();
        }

        // Cast the fetched column to int so the return value matches the documented type,
        // as getRevisionIdAtDate() and getWikidataItems() already do for their counts/IDs.
        $result = (int) $this->executeProjectsQuery($sql, $params)->fetchColumn(0);

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
217
218
    /**
     * Get various basic info used in the API in a single query: the number of revisions,
     *   the number of unique authors, the initial author with their edit count, the
     *   creation timestamp and revision ID, and the last-modified timestamp and revision ID.
     * Everything is combined into one query for better performance.
     * The result is only cached when the query took considerable time, because
     *   the gadget hits this for a different page constantly, so the likelihood
     *   of the cache benefiting us is slim.
     * @param Page $page The page.
     * @return string[] The single result row, with keys num_edits, num_editors, author,
     *   created_at, created_rev_id, modified_at, modified_rev_id and author_editcount.
     */
    public function getBasicEditingInfo(Page $page)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_basicinfo');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        $dbName = $page->getProject()->getDatabaseName();
        $revTable = $this->getTableName($dbName, 'revision');
        $userTable = $this->getTableName($dbName, 'user');
        $pageTable = $this->getTableName($dbName, 'page');

        $sql = "SELECT *, (
                   SELECT user_editcount
                   FROM $userTable
                   WHERE user_name = author
                ) AS author_editcount
                FROM (
                    (
                        SELECT COUNT(*) AS num_edits,
                               COUNT(DISTINCT(rev_user_text)) AS num_editors
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) a,
                    (
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
                        #   and the lowest rev_id may not be the first revision.
                        SELECT rev_user_text AS author,
                               rev_timestamp AS created_at,
                               rev_id AS created_rev_id
                        FROM $revTable
                        WHERE rev_page = :pageid
                        ORDER BY rev_timestamp ASC
                        LIMIT 1
                    ) b,
                    (
                        SELECT MAX(rev_timestamp) AS modified_at
                        FROM $revTable
                        WHERE rev_page = :pageid
                    ) c,
                    (
                        SELECT page_latest AS modified_rev_id
                        FROM $pageTable
                        WHERE page_id = :pageid
                    ) d
                );";

        // Note the time before and after the query so we can decide whether or not
        // to cache the result.
        $startedAt = time();

        // This query can sometimes take too long to run for pages with tens of thousands
        // of revisions. It is used by the ArticleInfo gadget, which shows basic data in
        // real-time, so if it takes too long then the user probably didn't even wait to
        // see the result. We pass 60 as the last parameter to executeProjectsQuery, which
        // will set the max_statement_time to 60 seconds.
        $row = $this->executeProjectsQuery($sql, ['pageid' => $page->getId()], 60)->fetch();

        $elapsed = time() - $startedAt;

        // If it took over 5 seconds, cache the result for 20 minutes.
        if ($elapsed > 5) {
            $this->setCache($cacheKey, $row, 'PT20M');
        }

        return $row;
    }
298
299
    /**
     * Get assessment data for the given pages.
     * Results are cached under a key derived from the arguments.
     * @param Project $project The project to which the pages belong.
     * @param int[] $pageIds Page IDs.
     * @return string[] Assessment rows as retrieved from the database, each with the keys
     *   wikiproject, class and importance. Empty when the project has no page assessments
     *   or when no page IDs were given.
     */
    public function getAssessments(Project $project, $pageIds)
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_assessments');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        if (!$project->hasPageAssessments()) {
            return [];
        }

        // The IDs are interpolated into the IN (...) list, so force each one to an integer.
        // Note the argument order: implode() takes the separator first; the reversed legacy
        // order is no longer accepted by current PHP versions.
        $idList = implode(',', array_map('intval', $pageIds));
        if ($idList === '') {
            // "IN ()" is not valid SQL, and no pages means no assessments.
            return [];
        }

        $paTable = $this->getTableName($project->getDatabaseName(), 'page_assessments');
        $papTable = $this->getTableName($project->getDatabaseName(), 'page_assessments_projects');

        $sql = "SELECT pap_project_title AS wikiproject, pa_class AS class, pa_importance AS importance
                FROM $paTable
                LEFT JOIN $papTable ON pa_project_id = pap_project_id
                WHERE pa_page_id IN ($idList)";

        $result = $this->executeProjectsQuery($sql)->fetchAll();

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
329
330
    /**
     * Look up the open CheckWiki errors recorded for one page.
     * @param Page $page
     * @return array Rows with the keys error, notice, found, name, prio and explanation.
     *   Always empty for pages outside namespace 0 and on non-Labs installations.
     */
    public function getCheckWikiErrors(Page $page)
    {
        // Only mainspace is supported...
        if ($page->getNamespace() !== 0) {
            return [];
        }
        // ...and only on Labs installations.
        if (!$this->isLabs()) {
            return [];
        }

        // Database name with a trailing _p removed, if present.
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // Page title with spaces rather than underscores (str_replace just to be sure).
        $pageTitle = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $stmt = $this->getToolsConnection()->prepare($sql);
        $stmt->bindParam(':dbName', $dbName);
        $stmt->bindParam(':title', $pageTitle);
        $stmt->execute();

        return $stmt->fetchAll();
    }
364
365
    /**
     * Fetch the Wikidata label and description of a page, in the language of the page's project.
     * @param Page $page
     * @return string[] Empty when the page has no Wikidata ID; otherwise in the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     */
    public function getWikidataInfo(Page $page)
    {
        if (!$page->getWikidataId()) {
            return [];
        }

        $sql = "SELECT term_type AS term, term_text
                FROM wikidatawiki_p.wb_terms
                WHERE term_entity_id = :wikidataId
                AND term_type IN ('label', 'description')
                AND term_language = :lang";

        // The query takes the ID with any leading 'Q' characters stripped.
        $entityId = ltrim($page->getWikidataId(), 'Q');
        $queryParams = [
            'lang' => $page->getProject()->getLang(),
            'wikidataId' => $entityId,
        ];

        return $this->executeProjectsQuery($sql, $queryParams)->fetchAll();
    }
394
395
    /**
     * Get or count all Wikidata items for the given page,
     *     not just languages of sister projects.
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT.
     * @return string[]|int Records as returned by the DB, or the raw COUNT of the records.
     *                      For a page without a Wikidata ID this is [] or 0 respectively.
     */
    public function getWikidataItems(Page $page, $count = false)
    {
        if (!$page->getWikidataId()) {
            if ($count) {
                return 0;
            }
            return [];
        }

        $columns = $count ? 'COUNT(*) AS count' : '*';
        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        // The query takes the ID with any leading 'Q' characters stripped.
        $rows = $this->executeProjectsQuery($sql, [
            'wikidataId' => ltrim($page->getWikidataId(), 'Q'),
        ])->fetchAll();

        if ($count) {
            return (int) $rows[0]['count'];
        }

        return $rows;
    }
421
422
    /**
     * Count the external links, outgoing links, incoming links and redirects of a page
     * with a single UNION query.
     * @param Page $page
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
     *                  'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page)
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->executeProjectsQuery($sql, [
            'id' => $page->getId(),
            // The title is matched in its underscored form.
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Turn the (value, type) rows into a map of "<type>_count" => value.
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = $row['value'];
        }

        return $counts;
    }
462
463
    /**
     * Count the Wikidata items for the given page, not just languages of sister projects.
     * Shorthand for getWikidataItems() in count mode.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page)
    {
        $numItems = $this->getWikidataItems($page, true);

        return $numItems;
    }
472
473
    /**
     * Fetch daily pageview data for a page over a timeframe from the Wikimedia REST API
     * (all-access, user agent type 'user').
     * @FIXME use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start In the format YYYYMMDD
     * @param string|DateTime $end In the format YYYYMMDD
     * @return string[] The JSON response body decoded into an associative array.
     */
    public function getPageviews(Page $page, $start, $end)
    {
        // Accept either a DateTime or a date string, and reduce both to YYYYMMDD.
        $toYmd = function ($date) {
            if (!($date instanceof DateTime)) {
                $date = new DateTime($date);
            }
            return $date->format('Ymd');
        };

        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
        $startYmd = $toYmd($start);
        $endYmd = $toYmd($end);
        $domain = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
            . $domain . '/all-access/user/' . $title . '/daily/' . $startYmd . '/' . $endYmd;

        $client = new GuzzleHttp\Client();
        $response = $client->request('GET', $url);
        $body = $response->getBody()->getContents();

        return json_decode($body, true);
    }
505
506
    /**
     * Download the full HTML content of the page by requesting its URL.
     * @param  Page $page
     * @param  int $revId What revision to query for; when given it is appended to the
     *   page URL as the 'oldid' query parameter.
     * @return string
     */
    public function getHTMLContent(Page $page, $revId = null)
    {
        $client = new GuzzleHttp\Client();

        $query = $revId !== null ? "?oldid=$revId" : '';
        $url = $page->getUrl() . $query;

        $response = $client->request('GET', $url);

        return $response->getBody()->getContents();
    }
523
524
    /**
     * Get the ID of the revision of a page at the time of the given DateTime, i.e. the
     * highest revision ID whose timestamp is not later than $date.
     * @param  Page     $page
     * @param  DateTime $date
     * @return int The revision ID; the query result is cast to int, so this is 0 when no
     *   revision matches.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date)
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // MAX() already collapses the result to a single row, so no LIMIT is needed.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :datestamp
                AND rev_page = :pageid";

        // Bind the values rather than interpolating them into the SQL text, as the other
        // queries in this class do. This also keeps the statement well-formed when the
        // page ID is null.
        $resultQuery = $this->getProjectsConnection()->prepare($sql);
        $resultQuery->bindValue(':datestamp', $date->format('YmdHis'));
        $resultQuery->bindValue(':pageid', $page->getId());
        $resultQuery->execute();

        return (int)$resultQuery->fetchColumn();
    }
542
543
    /**
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
     * The titles are sent in batches of 50, so this makes ceil(t/50) API requests, where t is
     * the number of titles supplied.
     * @param Project $project The project.
     * @param string[] $pageTitles The titles to fetch.
     * @return string[] Keys are the original supplied title, and values are the display titles.
     *   A batch whose response contains no pages contributes no entries.
     * @static
     */
    public static function displayTitles(Project $project, $pageTitles)
    {
        $api = $project->getApi();
        $displayTitles = [];

        foreach (array_chunk($pageTitles, 50) as $titleSlice) {
            $params = [
                'prop' => 'info|pageprops',
                'inprop' => 'displaytitle',
                'titles' => join('|', $titleSlice),
            ];
            $query = new SimpleRequest('query', $params);
            $result = $api->postRequest($query);

            // Skip responses without a page list instead of iterating over a missing key.
            if (!isset($result['query']['pages'])) {
                continue;
            }

            // Extract normalization info: map each normalized title back to the title
            // that was originally supplied.
            $normalized = [];
            if (isset($result['query']['normalized'])) {
                foreach ($result['query']['normalized'] as $e) {
                    $normalized[$e['to']] = $e['from'];
                }
            }

            // Match up the normalized titles with the display titles and the original titles.
            foreach ($result['query']['pages'] as $pageInfo) {
                $displayTitle = isset($pageInfo['pageprops']['displaytitle'])
                    ? $pageInfo['pageprops']['displaytitle']
                    : $pageInfo['title'];
                $origTitle = isset($normalized[$pageInfo['title']])
                    ? $normalized[$pageInfo['title']] : $pageInfo['title'];
                $displayTitles[$origTitle] = $displayTitle;
            }
        }

        return $displayTitles;
    }
591
}
592