Passed
Push — master ( 77644d...1d6156 )
by MusikAnimal
06:06
created

PageRepository::getRevisionsStmt()   B

Complexity

Conditions 7
Paths 9

Size

Total Lines 54
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 23
nc 9
nop 6
dl 0
loc 54
rs 8.6186
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
declare(strict_types = 1);
7
8
namespace AppBundle\Repository;
9
10
use AppBundle\Model\Page;
11
use AppBundle\Model\Project;
12
use AppBundle\Model\User;
13
use DateTime;
14
use GuzzleHttp;
15
use Mediawiki\Api\SimpleRequest;
16
17
/**
18
 * A PageRepository fetches data about Pages, either singularly or for multiple.
19
 * Despite the name, this does not have a direct correlation with the Pages tool.
20
 * @codeCoverageIgnore
21
 */
22
class PageRepository extends Repository
23
{
24
    /**
25
     * Get metadata about a single page from the API.
26
     * @param Project $project The project to which the page belongs.
27
     * @param string $pageTitle Page title.
28
     * @return string[]|null Array with some of the following keys: pageid, title, missing, displaytitle, url.
29
     *   Returns null if page does not exist.
30
     */
31
    public function getPageInfo(Project $project, string $pageTitle): ?array
32
    {
33
        $info = $this->getPagesInfo($project, [$pageTitle]);
34
        return null !== $info ? array_shift($info) : null;
35
    }
36
37
    /**
38
     * Get metadata about a set of pages from the API.
39
     * @param Project $project The project to which the pages belong.
40
     * @param string[] $pageTitles Array of page titles.
41
     * @return string[]|null Array keyed by the page names, each element with some of the following keys: pageid,
42
     *   title, missing, displaytitle, url. Returns null if page does not exist.
43
     */
44
    public function getPagesInfo(Project $project, array $pageTitles): ?array
45
    {
46
        // @TODO: Also include 'extlinks' prop when we start checking for dead external links.
47
        $params = [
48
            'prop' => 'info|pageprops',
49
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|readable|displaytitle',
50
            'converttitles' => '',
51
            // 'ellimit' => 20,
52
            // 'elexpandurl' => '',
53
            'titles' => join('|', $pageTitles),
54
            'formatversion' => 2,
55
            // 'pageids' => $pageIds // FIXME: allow page IDs
56
        ];
57
58
        $query = new SimpleRequest('query', $params);
59
        $api = $this->getMediawikiApi($project);
60
        $res = $api->getRequest($query);
61
        $result = [];
62
        if (isset($res['query']['pages'])) {
63
            foreach ($res['query']['pages'] as $pageInfo) {
64
                $result[$pageInfo['title']] = $pageInfo;
65
            }
66
        } else {
67
            return null;
68
        }
69
        return $result;
70
    }
71
72
    /**
73
     * Get the full page text of a set of pages.
74
     * @param Project $project The project to which the pages belong.
75
     * @param string[] $pageTitles Array of page titles.
76
     * @return string[] Array keyed by the page names, with the page text as the values.
77
     */
78
    public function getPagesWikitext(Project $project, array $pageTitles): array
79
    {
80
        $query = new SimpleRequest('query', [
81
            'prop' => 'revisions',
82
            'rvprop' => 'content',
83
            'titles' => join('|', $pageTitles),
84
            'formatversion' => 2,
85
        ]);
86
        $result = [];
87
88
        $api = $this->getMediawikiApi($project);
89
        $res = $api->getRequest($query);
90
91
        if (!isset($res['query']['pages'])) {
92
            return [];
93
        }
94
95
        foreach ($res['query']['pages'] as $page) {
96
            if (isset($page['revisions'][0]['content'])) {
97
                $result[$page['title']] = $page['revisions'][0]['content'];
98
            } else {
99
                $result[$page['title']] = '';
100
            }
101
        }
102
103
        return $result;
104
    }
105
106
    /**
107
     * Get revisions of a single page.
108
     * @param Page $page The page.
109
     * @param User|null $user Specify to get only revisions by the given user.
110
     * @param false|int $start
111
     * @param false|int $end
112
     * @return string[] Each member with keys: id, timestamp, length.
113
     */
114
    public function getRevisions(Page $page, ?User $user = null, $start = false, $end = false): array
115
    {
116
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');
117
        if ($this->cache->hasItem($cacheKey)) {
118
            return $this->cache->getItem($cacheKey)->get();
119
        }
120
121
        $stmt = $this->getRevisionsStmt($page, $user, null, null, $start, $end);
122
        $result = $stmt->fetchAll();
123
124
        // Cache and return.
125
        return $this->setCache($cacheKey, $result);
126
    }
127
128
    /**
129
     * Get the statement for a single revision, so that you can iterate row by row.
130
     * @param Page $page The page.
131
     * @param User|null $user Specify to get only revisions by the given user.
132
     * @param int $limit Max number of revisions to process.
133
     * @param int $numRevisions Number of revisions, if known. This is used solely to determine the
134
     *   OFFSET if we are given a $limit (see below). If $limit is set and $numRevisions is not set,
135
     *   a separate query is ran to get the number of revisions.
136
     * @param false|int $start
137
     * @param false|int $end
138
     * @return \Doctrine\DBAL\Driver\Statement
139
     */
140
    public function getRevisionsStmt(
141
        Page $page,
142
        ?User $user = null,
143
        ?int $limit = null,
144
        ?int $numRevisions = null,
145
        $start = false,
146
        $end = false
147
    ): \Doctrine\DBAL\Driver\Statement {
148
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');
149
        if ($this->cache->hasItem($cacheKey)) {
150
            return $this->cache->getItem($cacheKey)->get();
151
        }
152
153
        $revTable = $this->getTableName(
154
            $page->getProject()->getDatabaseName(),
155
            'revision',
156
            $user ? null : '' // Use 'revision' if there's no user, otherwise default to revision_userindex
157
        );
158
        $commentTable = $this->getTableName($page->getProject()->getDatabaseName(), 'comment');
159
        $userClause = $user ? "revs.rev_user_text = :username AND " : "";
160
161
        $limitClause = '';
162
        if (intval($limit) > 0 && isset($numRevisions)) {
163
            $limitClause = "LIMIT $limit";
164
        }
165
166
        $dateConditions = $this->getDateConditions($start, $end, 'revs.');
167
168
        $sql = "SELECT * FROM (
169
                    SELECT
170
                        revs.rev_id AS id,
171
                        revs.rev_timestamp AS timestamp,
172
                        revs.rev_minor_edit AS minor,
173
                        revs.rev_len AS length,
174
                        (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
175
                        revs.rev_user AS user_id,
176
                        revs.rev_user_text AS username,
177
                        comment_text AS `comment`,
178
                        revs.rev_sha1 AS sha
179
                    FROM $revTable AS revs
180
                    LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
181
                    LEFT OUTER JOIN $commentTable ON comment_id = revs.rev_comment_id
182
                    WHERE $userClause revs.rev_page = :pageid $dateConditions
183
                    ORDER BY revs.rev_timestamp DESC
184
                    $limitClause
185
                ) a
186
                ORDER BY timestamp ASC";
187
188
        $params = ['pageid' => $page->getId()];
189
        if ($user) {
190
            $params['username'] = $user->getUsername();
191
        }
192
193
        return $this->setCache($cacheKey, $this->executeProjectsQuery($sql, $params));
194
    }
195
196
    /**
197
     * Get a count of the number of revisions of a single page
198
     * @param Page $page The page.
199
     * @param User|null $user Specify to only count revisions by the given user.
200
     * @param false|int $start
201
     * @param false|int $end
202
     * @return int
203
     */
204
    public function getNumRevisions(Page $page, ?User $user = null, $start = false, $end = false): int
205
    {
206
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
207
        if ($this->cache->hasItem($cacheKey)) {
208
            return $this->cache->getItem($cacheKey)->get();
209
        }
210
211
        // In this case revision is faster than revision_userindex if we're not querying by user.
212
        $revTable = $page->getProject()->getTableName(
213
            'revision',
214
            $user && $this->isLabs() ? '_userindex' : ''
215
        );
216
        $userClause = $user ? "rev_user_text = :username AND " : "";
217
218
        $dateConditions = $this->getDateConditions($start, $end);
219
220
        $sql = "SELECT COUNT(*)
221
                FROM $revTable
222
                WHERE $userClause rev_page = :pageid $dateConditions";
223
        $params = ['pageid' => $page->getId()];
224
        if ($user) {
225
            $params['username'] = $user->getUsername();
226
        }
227
228
        $result = (int)$this->executeProjectsQuery($sql, $params)->fetchColumn(0);
229
230
        // Cache and return.
231
        return $this->setCache($cacheKey, $result);
232
    }
233
234
    /**
235
     * Get various basic info used in the API, including the number of revisions, unique authors, initial author
236
     * and edit count of the initial author. This is combined into one query for better performance. Caching is only
237
     * applied if it took considerable time to process, because using the gadget, this will get hit for a different page
238
     * constantly, where the likelihood of cache benefiting us is slim.
239
     * @param Page $page The page.
240
     * @return string[]|false false if the page was not found.
241
     */
242
    public function getBasicEditingInfo(Page $page)
243
    {
244
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_basicinfo');
245
        if ($this->cache->hasItem($cacheKey)) {
246
            return $this->cache->getItem($cacheKey)->get();
247
        }
248
249
        $revTable = $this->getTableName($page->getProject()->getDatabaseName(), 'revision');
250
        $userTable = $this->getTableName($page->getProject()->getDatabaseName(), 'user');
251
        $pageTable = $this->getTableName($page->getProject()->getDatabaseName(), 'page');
252
253
        $sql = "SELECT *, (
254
                   SELECT user_editcount
255
                   FROM $userTable
256
                   WHERE user_name = author
257
                ) AS author_editcount
258
                FROM (
259
                    (
260
                        SELECT COUNT(rev_id) AS num_edits,
261
                            COUNT(DISTINCT(rev_user_text)) AS num_editors
262
                        FROM $revTable
263
                        WHERE rev_page = :pageid
264
                        AND rev_timestamp > 0 # Use rev_timestamp index
265
                    ) a,
266
                    (
267
                        # With really old pages, the rev_timestamp may need to be sorted ASC,
268
                        #   and the lowest rev_id may not be the first revision.
269
                        SELECT rev_user_text AS author,
270
                               rev_timestamp AS created_at,
271
                               rev_id AS created_rev_id
272
                        FROM $revTable
273
                        WHERE rev_page = :pageid
274
                        ORDER BY rev_timestamp ASC
275
                        LIMIT 1
276
                    ) b,
277
                    (
278
                        SELECT rev_timestamp AS modified_at,
279
                               rev_id AS modified_rev_id
280
                        FROM $revTable
281
                        JOIN $pageTable ON page_id = rev_page
282
                        WHERE rev_page = :pageid
283
                        AND rev_id = page_latest
284
                    ) c
285
                );";
286
        $params = ['pageid' => $page->getId()];
287
288
        // Get current time so we can compare timestamps
289
        // and decide whether or to cache the result.
290
        $time1 = time();
291
292
        /**
293
         * This query can sometimes take too long to run for pages with tens of thousands
294
         * of revisions. This query is used by the ArticleInfo gadget, which shows basic
295
         * data in real-time, so if it takes too long than the user probably didn't even
296
         * wait to see the result. We'll pass 60 as the last parameter to executeProjectsQuery,
297
         * which will set the max_statement_time to 60 seconds.
298
         */
299
        $result = $this->executeProjectsQuery($sql, $params, 60)->fetch();
300
301
        $time2 = time();
302
303
        // If it took over 5 seconds, cache the result for 20 minutes.
304
        if ($time2 - $time1 > 5) {
305
            $this->setCache($cacheKey, $result, 'PT20M');
306
        }
307
308
        return $result ?? false;
309
    }
310
311
    /**
312
     * Get any CheckWiki errors of a single page
313
     * @param Page $page
314
     * @return array Results from query
315
     */
316
    public function getCheckWikiErrors(Page $page): array
317
    {
318
        // Only support mainspace on Labs installations
319
        if (0 !== $page->getNamespace() || !$this->isLabs()) {
320
            return [];
321
        }
322
323
        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
324
                FROM s51080__checkwiki_p.cw_error a
325
                JOIN s51080__checkwiki_p.cw_overview_errors b
326
                WHERE a.project = b.project
327
                AND a.project = :dbName
328
                AND a.title = :title
329
                AND a.error = b.id
330
                AND a.ok = 0";
331
332
        // remove _p if present
333
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());
334
335
        // Page title without underscores (str_replace just to be sure)
336
        $pageTitle = str_replace('_', ' ', $page->getTitle());
337
338
        $resultQuery = $this->getToolsConnection()->prepare($sql);
339
        $resultQuery->bindParam(':dbName', $dbName);
340
        $resultQuery->bindParam(':title', $pageTitle);
341
        $resultQuery->execute();
342
343
        return $resultQuery->fetchAll();
344
    }
345
346
    /**
347
     * Get basic wikidata on the page: label and description.
348
     * @param Page $page
349
     * @return string[] In the format:
350
     *    [[
351
     *         'term' => string such as 'label',
352
     *         'term_text' => string (value for 'label'),
353
     *     ], ... ]
354
     */
355
    public function getWikidataInfo(Page $page): array
356
    {
357
        if (empty($page->getWikidataId())) {
358
            return [];
359
        }
360
361
        $wikidataId = 'Q'.ltrim($page->getWikidataId(), 'Q');
362
        $lang = $page->getProject()->getLang();
363
364
        $sql = "SELECT term_type AS term, term_text
365
                FROM wikidatawiki_p.wb_terms
366
                WHERE term_full_entity_id = :wikidataId
367
                AND term_type IN ('label', 'description')
368
                AND term_language = :lang";
369
370
        return $this->executeProjectsQuery($sql, [
371
            'lang' => $lang,
372
            'wikidataId' => $wikidataId,
373
        ])->fetchAll();
374
    }
375
376
    /**
377
     * Get or count all wikidata items for the given page,
378
     *     not just languages of sister projects
379
     * @param Page $page
380
     * @param bool $count Set to true to get only a COUNT
381
     * @return string[]|int Records as returend by the DB,
382
     *                      or raw COUNT of the records.
383
     */
384
    public function getWikidataItems(Page $page, bool $count = false)
385
    {
386
        if (!$page->getWikidataId()) {
387
            return $count ? 0 : [];
388
        }
389
390
        $wikidataId = ltrim($page->getWikidataId(), 'Q');
391
392
        $sql = "SELECT " . ($count ? 'COUNT(*) AS count' : '*') . "
393
                FROM wikidatawiki_p.wb_items_per_site
394
                WHERE ips_item_id = :wikidataId";
395
396
        $result = $this->executeProjectsQuery($sql, [
397
            'wikidataId' => $wikidataId,
398
        ])->fetchAll();
399
400
        return $count ? (int) $result[0]['count'] : $result;
401
    }
402
403
    /**
404
     * Get number of in and outgoing links and redirects to the given page.
405
     * @param Page $page
406
     * @return string[] Counts with the keys 'links_ext_count', 'links_out_count',
407
     *                  'links_in_count' and 'redirects_count'
408
     */
409
    public function countLinksAndRedirects(Page $page): array
410
    {
411
        $externalLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'externallinks');
412
        $pageLinksTable = $this->getTableName($page->getProject()->getDatabaseName(), 'pagelinks');
413
        $redirectTable = $this->getTableName($page->getProject()->getDatabaseName(), 'redirect');
414
415
        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
416
                FROM $externalLinksTable WHERE el_from = :id
417
                UNION
418
                SELECT COUNT(*) AS value, 'links_out' AS type
419
                FROM $pageLinksTable WHERE pl_from = :id
420
                UNION
421
                SELECT COUNT(*) AS value, 'links_in' AS type
422
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
423
                UNION
424
                SELECT COUNT(*) AS value, 'redirects' AS type
425
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";
426
427
        $params = [
428
            'id' => $page->getId(),
429
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
430
            'namespace' => $page->getNamespace(),
431
        ];
432
433
        $res = $this->executeProjectsQuery($sql, $params);
434
        $data = [];
435
436
        // Transform to associative array by 'type'
437
        foreach ($res as $row) {
438
            $data[$row['type'] . '_count'] = (int)$row['value'];
439
        }
440
441
        return $data;
442
    }
443
444
    /**
445
     * Count wikidata items for the given page, not just languages of sister projects
446
     * @param Page $page
447
     * @return int Number of records.
448
     */
449
    public function countWikidataItems(Page $page): int
450
    {
451
        return $this->getWikidataItems($page, true);
452
    }
453
454
    /**
455
     * Get page views for the given page and timeframe.
456
     * @fixme use Symfony Guzzle package.
457
     * @param Page $page
458
     * @param string|DateTime $start In the format YYYYMMDD
459
     * @param string|DateTime $end In the format YYYYMMDD
460
     * @return string[]
461
     */
462
    public function getPageviews(Page $page, $start, $end): array
463
    {
464
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));
465
        $client = new GuzzleHttp\Client();
466
467
        if ($start instanceof DateTime) {
468
            $start = $start->format('Ymd');
469
        } else {
470
            $start = (new DateTime($start))->format('Ymd');
471
        }
472
        if ($end instanceof DateTime) {
473
            $end = $end->format('Ymd');
474
        } else {
475
            $end = (new DateTime($end))->format('Ymd');
476
        }
477
478
        $project = $page->getProject()->getDomain();
479
480
        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
481
            "$project/all-access/user/$title/daily/$start/$end";
482
483
        $res = $client->request('GET', $url);
484
        return json_decode($res->getBody()->getContents(), true);
485
    }
486
487
    /**
488
     * Get the full HTML content of the the page.
489
     * @param Page $page
490
     * @param int $revId What revision to query for.
491
     * @return string
492
     */
493
    public function getHTMLContent(Page $page, ?int $revId = null): string
494
    {
495
        $client = new GuzzleHttp\Client();
496
        $url = $page->getUrl();
497
        if (null !== $revId) {
498
            $url .= "?oldid=$revId";
499
        }
500
        return $client->request('GET', $url)
501
            ->getBody()
502
            ->getContents();
503
    }
504
505
    /**
506
     * Get the ID of the revision of a page at the time of the given DateTime.
507
     * @param Page $page
508
     * @param DateTime $date
509
     * @return int
510
     */
511
    public function getRevisionIdAtDate(Page $page, DateTime $date): int
512
    {
513
        $revisionTable = $page->getProject()->getTableName('revision');
514
        $pageId = $page->getId();
515
        $datestamp = $date->format('YmdHis');
516
        $sql = "SELECT MAX(rev_id)
517
                FROM $revisionTable
518
                WHERE rev_timestamp <= $datestamp
519
                AND rev_page = $pageId LIMIT 1;";
520
        $resultQuery = $this->getProjectsConnection()->query($sql);
521
        return (int)$resultQuery->fetchColumn();
522
    }
523
524
    /**
525
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
526
     * This will send t/50 API requests where t is the number of titles supplied.
527
     * @param Project $project The project.
528
     * @param string[] $pageTitles The titles to fetch.
529
     * @return string[] Keys are the original supplied title, and values are the display titles.
530
     * @static
531
     */
532
    public function displayTitles(Project $project, array $pageTitles): array
533
    {
534
        $client = $this->container->get('guzzle.client.xtools');
535
536
        $displayTitles = [];
537
        $numPages = count($pageTitles);
538
539
        for ($n = 0; $n < $numPages; $n += 50) {
540
            $titleSlice = array_slice($pageTitles, $n, 50);
541
            $res = $client->request('GET', $project->getApiUrl(), ['query' => [
542
                'action' => 'query',
543
                'prop' => 'info|pageprops',
544
                'inprop' => 'displaytitle',
545
                'titles' => join('|', $titleSlice),
546
                'format' => 'json',
547
            ]]);
548
            $result = json_decode($res->getBody()->getContents(), true);
549
550
            // Extract normalization info.
551
            $normalized = [];
552
            if (isset($result['query']['normalized'])) {
553
                array_map(
554
                    function ($e) use (&$normalized): void {
555
                        $normalized[$e['to']] = $e['from'];
556
                    },
557
                    $result['query']['normalized']
558
                );
559
            }
560
561
            // Match up the normalized titles with the display titles and the original titles.
562
            foreach ($result['query']['pages'] as $pageInfo) {
563
                $displayTitle = $pageInfo['pageprops']['displaytitle'] ?? $pageInfo['title'];
564
                $origTitle = $normalized[$pageInfo['title']] ?? $pageInfo['title'];
565
                $displayTitles[$origTitle] = $displayTitle;
566
            }
567
        }
568
569
        return $displayTitles;
570
    }
571
}
572