Test Failed
Push — master ( cd250e...0bf454 )
by
unknown
12:40
created

PageRepository::getPagesWikitext()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 24
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 15
nc 4
nop 2
dl 0
loc 24
rs 9.7666
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the PageRepository class.
4
 */
5
6
declare(strict_types = 1);
7
8
namespace AppBundle\Repository;
9
10
use AppBundle\Model\Page;
11
use AppBundle\Model\Project;
12
use AppBundle\Model\User;
13
use DateTime;
14
use Doctrine\DBAL\Driver\Statement;
15
use GuzzleHttp;
16
17
/**
18
 * A PageRepository fetches data about Pages, either for a single page or for several at once.
19
 * Despite the name, this does not have a direct correlation with the Pages tool.
20
 * @codeCoverageIgnore
21
 */
22
class PageRepository extends Repository
23
{
24
    /**
     * Fetch API metadata for one page.
     * @param Project $project The project to which the page belongs.
     * @param string $pageTitle Page title.
     * @return string[]|null Array with some of the following keys: pageid, title, missing, displaytitle, url.
     *   Null when the API response contained no page data.
     */
    public function getPageInfo(Project $project, string $pageTitle): ?array
    {
        $pages = $this->getPagesInfo($project, [$pageTitle]);
        if (null === $pages) {
            return null;
        }

        // Only one title was requested, so the first entry is the page we asked for.
        return array_shift($pages);
    }
36
37
    /**
     * Fetch API metadata for several pages in one request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[]|null Array keyed by the page titles as reported by the API, each element with some of
     *   the following keys: pageid, title, missing, displaytitle, url. Null when the response has no page data.
     */
    public function getPagesInfo(Project $project, array $pageTitles): ?array
    {
        $response = $this->executeApiRequest($project, [
            'prop' => 'info|pageprops',
            'inprop' => 'protection|talkid|watched|watchers|notificationtimestamp|subjectid|url|displaytitle',
            'converttitles' => '',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);

        if (!isset($response['query']['pages'])) {
            return null;
        }

        // Re-key the list of pages by title.
        $infoByTitle = [];
        foreach ($response['query']['pages'] as $pageInfo) {
            $infoByTitle[$pageInfo['title']] = $pageInfo;
        }

        return $infoByTitle;
    }
65
66
    /**
     * Fetch the wikitext of several pages in one API request.
     * @param Project $project The project to which the pages belong.
     * @param string[] $pageTitles Array of page titles.
     * @return string[] Array keyed by the page titles as reported by the API, with the page text as the
     *   values. A page without revision content maps to an empty string.
     */
    public function getPagesWikitext(Project $project, array $pageTitles): array
    {
        $response = $this->executeApiRequest($project, [
            'prop' => 'revisions',
            'rvprop' => 'content',
            'titles' => implode('|', $pageTitles),
            'formatversion' => 2,
        ]);

        $pages = $response['query']['pages'] ?? null;
        if (null === $pages) {
            return [];
        }

        $wikitext = [];
        foreach ($pages as $page) {
            // Fall back to an empty string when the page has no revision content.
            $wikitext[$page['title']] = $page['revisions'][0]['content'] ?? '';
        }

        return $wikitext;
    }
97
98
    /**
     * Fetch all revisions of one page, using the cache when possible.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return string[] Each member with keys: id, timestamp, length.
     */
    public function getRevisions(Page $page, ?User $user = null, $start = false, $end = false): array
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_revisions');

        if (!$this->cache->hasItem($cacheKey)) {
            // No limit is passed, so every matching revision is fetched.
            $revisions = $this->getRevisionsStmt($page, $user, null, null, $start, $end)->fetchAll();

            // Cache and return.
            return $this->setCache($cacheKey, $revisions);
        }

        return $this->cache->getItem($cacheKey)->get();
    }
119
120
    /**
     * Run the revisions query for a single page and return the statement, so that you can iterate row by row.
     *
     * The statement is intentionally not cached: it is a one-shot cursor rather than data, and
     * getRevisions() already caches the fetched rows under the 'page_revisions' key prefix.
     * @param Page $page The page.
     * @param User|null $user Specify to get only revisions by the given user.
     * @param int|null $limit Max number of revisions to process. Only applied when it is positive and
     *   $numRevisions is also given; in that case the most recent $limit revisions are selected.
     * @param int|null $numRevisions Number of revisions, if known. Its value is not used in the query;
     *   it only enables $limit (see above).
     * @param false|int $start
     * @param false|int $end
     * @return Statement Rows are ordered by timestamp, oldest first.
     */
    public function getRevisionsStmt(
        Page $page,
        ?User $user = null,
        ?int $limit = null,
        ?int $numRevisions = null,
        $start = false,
        $end = false
    ): Statement {
        $revTable = $this->getTableName(
            $page->getProject()->getDatabaseName(),
            'revision',
            $user ? null : '' // Use 'revision' if there's no user, otherwise default to revision_userindex
        );
        $commentTable = $page->getProject()->getTableName('comment');
        $actorTable = $page->getProject()->getTableName('actor');
        $userClause = $user ? "revs.rev_actor = :actorId AND " : "";

        $limitClause = '';
        if (intval($limit) > 0 && isset($numRevisions)) {
            $limitClause = "LIMIT $limit";
        }

        $dateConditions = $this->getDateConditions($start, $end, 'revs.');

        // The inner query picks the newest rows (so LIMIT keeps the most recent ones),
        // and the outer query puts them back into chronological order.
        $sql = "SELECT * FROM (
                    SELECT
                        revs.rev_id AS id,
                        revs.rev_timestamp AS timestamp,
                        revs.rev_minor_edit AS minor,
                        revs.rev_len AS length,
                        (CAST(revs.rev_len AS SIGNED) - IFNULL(parentrevs.rev_len, 0)) AS length_change,
                        actor_user AS user_id,
                        actor_name AS username,
                        comment_text AS `comment`,
                        revs.rev_sha1 AS sha
                    FROM $revTable AS revs
                    JOIN $actorTable ON revs.rev_actor = actor_id
                    LEFT JOIN $revTable AS parentrevs ON (revs.rev_parent_id = parentrevs.rev_id)
                    LEFT OUTER JOIN $commentTable ON comment_id = revs.rev_comment_id
                    WHERE $userClause revs.rev_page = :pageid $dateConditions
                    ORDER BY revs.rev_timestamp DESC
                    $limitClause
                ) a
                ORDER BY timestamp ASC";

        $params = ['pageid' => $page->getId()];
        if ($user) {
            $params['actorId'] = $user->getActorId($page->getProject());
        }

        return $this->executeProjectsQuery($sql, $params);
    }
189
190
    /**
     * Get a count of the number of revisions of a single page.
     * @param Page $page The page.
     * @param User|null $user Specify to only count revisions by the given user.
     * @param false|int $start
     * @param false|int $end
     * @return int
     */
    public function getNumRevisions(Page $page, ?User $user = null, $start = false, $end = false): int
    {
        $cacheKey = $this->getCacheKey(func_get_args(), 'page_numrevisions');
        if ($this->cache->hasItem($cacheKey)) {
            return $this->cache->getItem($cacheKey)->get();
        }

        // In this case revision is faster than revision_userindex if we're not querying by user.
        $revTable = $page->getProject()->getTableName(
            'revision',
            $user && $this->isLabs() ? '_userindex' : ''
        );
        $userClause = $user ? "rev_actor = :actorId AND " : "";

        $dateConditions = $this->getDateConditions($start, $end);

        $sql = "SELECT COUNT(*)
                FROM $revTable
                WHERE $userClause rev_page = :pageid $dateConditions";
        $params = ['pageid' => $page->getId()];
        if ($user) {
            // The key must match the :actorId placeholder used in $userClause.
            $params['actorId'] = $user->getActorId($page->getProject());
        }

        $result = (int)$this->executeProjectsQuery($sql, $params)->fetchColumn(0);

        // Cache and return.
        return $this->setCache($cacheKey, $result);
    }
227
228
    /**
     * Look up the open CheckWiki errors recorded for one page.
     * @param Page $page
     * @return array Rows with the keys error, notice, found, name, prio and explanation.
     *   Empty for pages outside the main namespace and on non-Labs installations.
     */
    public function getCheckWikiErrors(Page $page): array
    {
        // Only the main namespace is supported...
        if (0 !== $page->getNamespace()) {
            return [];
        }
        // ...and only on Labs installations.
        if (!$this->isLabs()) {
            return [];
        }

        // The query matches on the database name without a trailing "_p".
        $dbName = preg_replace('/_p$/', '', $page->getProject()->getDatabaseName());

        // The query matches on the title with spaces instead of underscores.
        $pageTitle = str_replace('_', ' ', $page->getTitle());

        $sql = "SELECT error, notice, found, name_trans AS name, prio, text_trans AS explanation
                FROM s51080__checkwiki_p.cw_error a
                JOIN s51080__checkwiki_p.cw_overview_errors b
                WHERE a.project = b.project
                AND a.project = :dbName
                AND a.title = :title
                AND a.error = b.id
                AND a.ok = 0";

        $statement = $this->getToolsConnection()->prepare($sql);
        $statement->bindParam(':dbName', $dbName);
        $statement->bindParam(':title', $pageTitle);
        $statement->execute();

        return $statement->fetchAll();
    }
262
263
    /**
     * Fetch the Wikidata label and description of the page in the project's language.
     * @param Page $page
     * @return string[] In the format:
     *    [[
     *         'term' => string such as 'label',
     *         'term_text' => string (value for 'label'),
     *     ], ... ]
     *   Empty if the page has no Wikidata ID.
     */
    public function getWikidataInfo(Page $page): array
    {
        $itemId = $page->getWikidataId();
        if (empty($itemId)) {
            return [];
        }

        $wikidataDb = 'wikidatawiki_p';

        $sql = "SELECT wby_name AS term, wbx_text AS term_text
                FROM $wikidataDb.wbt_item_terms
                JOIN $wikidataDb.wbt_term_in_lang ON wbit_term_in_lang_id = wbtl_id
                JOIN $wikidataDb.wbt_type ON wbtl_type_id = wby_id
                JOIN $wikidataDb.wbt_text_in_lang ON wbtl_text_in_lang_id = wbxl_id
                JOIN $wikidataDb.wbt_text ON wbxl_text_id = wbx_id
                WHERE wbit_item_id = :wikidataId
                AND wby_name IN ('label', 'description')
                AND wbxl_language = :lang";

        $params = [
            'lang' => $page->getProject()->getLang(),
            // The query takes the ID without its leading "Q".
            'wikidataId' => ltrim($itemId, 'Q'),
        ];

        return $this->executeProjectsQuery($sql, $params)->fetchAll();
    }
297
298
    /**
     * Get or count all Wikidata site links for the given page,
     *     not just languages of sister projects.
     * @param Page $page
     * @param bool $count Set to true to get only a COUNT
     * @return string[]|int Records as returned by the DB,
     *                      or raw COUNT of the records.
     */
    public function getWikidataItems(Page $page, bool $count = false)
    {
        $itemId = $page->getWikidataId();
        if (!$itemId) {
            // No Wikidata item: nothing to fetch or count.
            if ($count) {
                return 0;
            }
            return [];
        }

        $columns = $count ? 'COUNT(*) AS count' : '*';

        $sql = "SELECT $columns
                FROM wikidatawiki_p.wb_items_per_site
                WHERE ips_item_id = :wikidataId";

        // The query takes the ID without its leading "Q".
        $rows = $this->executeProjectsQuery($sql, [
            'wikidataId' => ltrim($itemId, 'Q'),
        ])->fetchAll();

        if ($count) {
            return (int) $rows[0]['count'];
        }

        return $rows;
    }
324
325
    /**
     * Count the external links, outgoing links, incoming links and redirects of the given page.
     * @param Page $page
     * @return int[] Counts with the keys 'links_ext_count', 'links_out_count',
     *               'links_in_count' and 'redirects_count'
     */
    public function countLinksAndRedirects(Page $page): array
    {
        $dbName = $page->getProject()->getDatabaseName();
        $externalLinksTable = $this->getTableName($dbName, 'externallinks');
        $pageLinksTable = $this->getTableName($dbName, 'pagelinks');
        $redirectTable = $this->getTableName($dbName, 'redirect');

        // One row per counter, each tagged with a 'type' label.
        $sql = "SELECT COUNT(*) AS value, 'links_ext' AS type
                FROM $externalLinksTable WHERE el_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_out' AS type
                FROM $pageLinksTable WHERE pl_from = :id
                UNION
                SELECT COUNT(*) AS value, 'links_in' AS type
                FROM $pageLinksTable WHERE pl_namespace = :namespace AND pl_title = :title
                UNION
                SELECT COUNT(*) AS value, 'redirects' AS type
                FROM $redirectTable WHERE rd_namespace = :namespace AND rd_title = :title";

        $rows = $this->executeProjectsQuery($sql, [
            'id' => $page->getId(),
            'title' => str_replace(' ', '_', $page->getTitleWithoutNamespace()),
            'namespace' => $page->getNamespace(),
        ]);

        // Turn the rows into an associative array keyed by "<type>_count".
        $counts = [];
        foreach ($rows as $row) {
            $key = $row['type'] . '_count';
            $counts[$key] = (int)$row['value'];
        }

        return $counts;
    }
365
366
    /**
     * Count all Wikidata site links for the given page, not just languages of sister projects.
     * @param Page $page
     * @return int Number of records.
     */
    public function countWikidataItems(Page $page): int
    {
        // Passing true as the second argument makes getWikidataItems() return only the count.
        $numItems = $this->getWikidataItems($page, true);

        return $numItems;
    }
375
376
    /**
     * Fetch daily pageviews of the given page for a timeframe from the Wikimedia REST API.
     * @fixme use Symfony Guzzle package.
     * @param Page $page
     * @param string|DateTime $start In the format YYYYMMDD
     * @param string|DateTime $end In the format YYYYMMDD
     * @return string[] The decoded JSON response.
     */
    public function getPageviews(Page $page, $start, $end): array
    {
        $title = rawurlencode(str_replace(' ', '_', $page->getTitle()));

        /** @var GuzzleHttp\Client $client */
        $client = $this->container->get('eight_points_guzzle.client.xtools');

        // Accept either a DateTime or a date string, and normalize both to YYYYMMDD.
        $toYmd = static function ($date): string {
            $dateTime = $date instanceof DateTime ? $date : new DateTime($date);
            return $dateTime->format('Ymd');
        };
        $start = $toYmd($start);
        $end = $toYmd($end);

        $project = $page->getProject()->getDomain();

        $url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' .
            "$project/all-access/user/$title/daily/$start/$end";

        $body = $client->request('GET', $url)->getBody()->getContents();

        return json_decode($body, true);
    }
410
411
    /**
     * Download the full HTML content of the page.
     * @param Page $page
     * @param int|null $revId What revision to query for. Null requests the page URL as-is.
     * @return string
     */
    public function getHTMLContent(Page $page, ?int $revId = null): string
    {
        /** @var GuzzleHttp\Client $client */
        $client = $this->container->get('eight_points_guzzle.client.xtools');

        // Only add the oldid query string when a specific revision was asked for.
        $url = $page->getUrl() . (null === $revId ? '' : "?oldid=$revId");

        $response = $client->request('GET', $url);

        return $response->getBody()->getContents();
    }
429
430
    /**
     * Get the ID of the revision of a page at the time of the given DateTime.
     * @param Page $page
     * @param DateTime $date Formatted as YmdHis in its own timezone before being compared with
     *   rev_timestamp. NOTE(review): presumably this should be UTC; confirm against callers.
     * @return int The highest revision ID at or before the date, or 0 if there is none.
     */
    public function getRevisionIdAtDate(Page $page, DateTime $date): int
    {
        $revisionTable = $page->getProject()->getTableName('revision');

        // Values are bound as parameters, like the other queries in this class, instead of being
        // interpolated into the SQL. MAX() without GROUP BY always yields one row, so no LIMIT is needed.
        $sql = "SELECT MAX(rev_id)
                FROM $revisionTable
                WHERE rev_timestamp <= :datestamp
                AND rev_page = :pageId";

        $resultQuery = $this->executeProjectsQuery($sql, [
            'datestamp' => $date->format('YmdHis'),
            'pageId' => $page->getId(),
        ]);

        // MAX() is NULL when no revision matches, which casts to 0.
        return (int)$resultQuery->fetchColumn();
    }
448
449
    /**
     * Get HTML display titles of a set of pages (or the normal title if there's no display title).
     * Titles are sent in batches of 50, so this makes ceil(t/50) API requests where t is the number
     * of titles supplied.
     * @param Project $project The project.
     * @param string[] $pageTitles The titles to fetch.
     * @return string[] Keys are the original supplied title, and values are the display titles.
     */
    public function displayTitles(Project $project, array $pageTitles): array
    {
        // Use the same HTTP client service as the other methods in this class.
        /** @var GuzzleHttp\Client $client */
        $client = $this->container->get('eight_points_guzzle.client.xtools');

        $displayTitles = [];

        foreach (array_chunk($pageTitles, 50) as $titleSlice) {
            $res = $client->request('GET', $project->getApiUrl(), ['query' => [
                'action' => 'query',
                'prop' => 'info|pageprops',
                'inprop' => 'displaytitle',
                'titles' => join('|', $titleSlice),
                'format' => 'json',
            ]]);
            $result = json_decode($res->getBody()->getContents(), true);

            // Extract normalization info: map each normalized title back to the supplied one.
            $normalized = [];
            foreach ($result['query']['normalized'] ?? [] as $entry) {
                $normalized[$entry['to']] = $entry['from'];
            }

            // Match up the normalized titles with the display titles and the original titles.
            // A response without page data (e.g. an API error) contributes nothing.
            foreach ($result['query']['pages'] ?? [] as $pageInfo) {
                $displayTitle = $pageInfo['pageprops']['displaytitle'] ?? $pageInfo['title'];
                $origTitle = $normalized[$pageInfo['title']] ?? $pageInfo['title'];
                $displayTitles[$origTitle] = $displayTitle;
            }
        }

        return $displayTitles;
    }
496
}
497