PageInfoApi::getBasicEditingInfo()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types = 1);
4
5
namespace App\Model;
6
7
use App\Exception\BadGatewayException;
8
use App\Helper\AutomatedEditsHelper;
9
use App\Helper\I18nHelper;
10
use App\Repository\PageInfoRepository;
11
use DateTime;
12
use Symfony\Component\DomCrawler\Crawler;
13
use Symfony\Component\HttpKernel\Exception\HttpException;
14
use Symfony\Component\HttpKernel\Exception\ServiceUnavailableHttpException;
15
16
/**
17
 * A PageInfoApi is standalone logic for the PageInfo tool. These methods perform SQL queries
18
 * or make API requests and can be called directly, without any knowledge of the child PageInfo class.
19
 * @see PageInfo
20
 */
21
class PageInfoApi extends Model
22
{
23
    /** @var int Number of days of recent data to show for pageviews. */
24
    public const PAGEVIEWS_OFFSET = 30;
25
26
    protected AutomatedEditsHelper $autoEditsHelper;
27
    protected I18nHelper $i18n;
28
29
    /** @var int Number of revisions that belong to the page. */
30
    protected int $numRevisions;
31
32
    /**
     * @var array Prose stats, with keys 'bytes', 'characters', 'words', 'references',
     *   'unique_references', 'sections'.
     */
33
    protected array $proseStats;
34
35
    /** @var array Number of categories, templates and files on the page. */
36
    protected array $transclusionData;
37
38
    /** @var array Various statistics about bots that edited the page. */
39
    protected array $bots;
40
41
    /** @var int Number of edits made to the page by bots. */
42
    protected int $botRevisionCount;
43
44
    /** @var int[] Number of in and outgoing links and redirects to the page. */
45
    protected array $linksAndRedirects;
46
47
    /** @var string[]|null Assessments of the page (see Page::getAssessments). */
48
    protected ?array $assessments;
49
50
    /** @var string[] List of Wikidata and Checkwiki errors. */
51
    protected array $bugs;
52
53
    /**
     * Create a PageInfoApi for one page and an optional date range.
     * @param PageInfoRepository $repository
     * @param I18nHelper $i18n
     * @param AutomatedEditsHelper $autoEditsHelper
     * @param Page $page The page to process.
     * @param false|int $start Start date as Unix timestamp (false by default).
     * @param false|int $end End date as Unix timestamp (false by default).
     */
    public function __construct(
        PageInfoRepository $repository,
        I18nHelper $i18n,
        AutomatedEditsHelper $autoEditsHelper,
        Page $page,
        $start = false,
        $end = false
    ) {
        // What is being analysed: the page and the date range.
        $this->page = $page;
        $this->start = $start;
        $this->end = $end;

        // Collaborators used to fetch and present the data.
        $this->repository = $repository;
        $this->autoEditsHelper = $autoEditsHelper;
        $this->i18n = $i18n;
    }
77
78
    /**
     * Get the number of revisions belonging to the page, within the configured date range.
     * The value is fetched from the page once and then kept on this object.
     * @return int
     */
    public function getNumRevisions(): int
    {
        if (isset($this->numRevisions)) {
            return $this->numRevisions;
        }

        $this->numRevisions = $this->page->getNumRevisions(null, $this->start, $this->end);

        return $this->numRevisions;
    }
89
90
    /**
     * Does the page have more revisions than the configured maximum allows us to process?
     * @return bool Always false when the configured maximum is not greater than zero.
     */
    public function tooManyRevisions(): bool
    {
        // A maximum that is not positive means no limit applies.
        if ($this->repository->getMaxPageRevisions() <= 0) {
            return false;
        }

        return $this->getNumRevisions() > $this->repository->getMaxPageRevisions();
    }
99
100
    /**
     * Get various basic info used in the API, including the number of revisions, unique authors, initial author
     * and edit count of the initial author. This is combined into one query for better performance.
     *
     * Caching is intentionally disabled: with the gadget, this gets hit for a different page constantly, so
     * the likelihood of a cache benefiting us is slim.
     * @return string[]|false false if the page was not found.
     */
    public function getBasicEditingInfo()
    {
        $page = $this->page;

        return $this->repository->getBasicEditingInfo($page);
    }
111
112
    /**
     * Get the top editors to the page by edit count.
     *
     * Results are memoized for the duration of the request, separately for each PageInfoApi instance and
     * each combination of arguments, so a call with a different $limit or $noBots (or on another instance)
     * is never answered with a result computed for a different call.
     * @param int $limit Default 20, maximum 1,000.
     * @param bool $noBots Set to true to exclude bots from the result.
     * @return array[] One entry per editor, with keys 'rank', 'username', 'count', 'minor', 'first_edit'
     *   and 'latest_edit'; the last two each hold an 'id' and an ISO 8601-style 'timestamp'.
     */
    public function getTopEditorsByEditCount(int $limit = 20, bool $noBots = false): array
    {
        // Quick cache, valid only for the same request. A function-level static is shared by every
        // instance of the class, so entries must be keyed by object as well as by the arguments.
        static $cache = [];

        $limit = min($limit, 1000);
        $cacheKey = spl_object_id($this).'|'.$limit.'|'.(int)$noBots;
        if (isset($cache[$cacheKey]) && $cache[$cacheKey]['owner'] === $this) {
            return $cache[$cacheKey]['editors'];
        }

        $rows = $this->repository->getTopEditorsByEditCount(
            $this->page,
            $this->start,
            $this->end,
            $limit,
            $noBots
        );

        // Convert a 'YmdHis' database timestamp into the format used in the API output.
        $formatTimestamp = static function ($timestamp) {
            return DateTime::createFromFormat('YmdHis', $timestamp)
                ->format('Y-m-d\TH:i:s\Z');
        };

        $topEditors = [];
        $rank = 0;
        foreach ($rows as $row) {
            $topEditors[] = [
                'rank' => ++$rank,
                'username' => $row['username'],
                'count' => $row['count'],
                'minor' => $row['minor'],
                'first_edit' => [
                    'id' => $row['first_revid'],
                    'timestamp' => $formatTimestamp($row['first_timestamp']),
                ],
                'latest_edit' => [
                    'id' => $row['latest_revid'],
                    'timestamp' => $formatTimestamp($row['latest_timestamp']),
                ],
            ];
        }

        // Storing $this with the result keeps the object alive, so its object ID cannot be handed
        // to a different instance while this cache entry still exists.
        $cache[$cacheKey] = ['owner' => $this, 'editors' => $topEditors];

        return $topEditors;
    }
157
158
    /**
     * Get prose and reference information, computed from the page's HTML content.
     * A successful result is kept on this object and reused by later calls.
     * @return array|null With keys 'bytes', 'characters', 'words', 'references', 'unique_references'
     *   and 'sections', or null if the HTML could not be fetched.
     */
    public function getProseStats(): ?array
    {
        if (isset($this->proseStats)) {
            return $this->proseStats;
        }

        // Only an integer end date is turned into a DateTime for the content request.
        $datetime = null;
        if (is_int($this->end)) {
            $datetime = new DateTime("@$this->end");
        }

        try {
            $html = $this->page->getHTMLContent($datetime);
        } catch (BadGatewayException $e) {
            // Prose stats are non-critical, so handle the BadGatewayException gracefully in the views.
            return null;
        }

        $crawler = new Crawler($html);

        // Select the references first: countCharsAndWords() strips them out of the document.
        $refs = $crawler->filter('[typeof~="mw:Extension/ref"]');

        [$bytes, $chars, $words] = $this->countCharsAndWords($crawler);

        // Text of every reference, used to work out how many are distinct.
        $refTexts = $refs->each(static function ($ref) {
            return $ref->text();
        });

        $this->proseStats = [
            'bytes' => $bytes,
            'characters' => $chars,
            'words' => $words,
            'references' => $refs->count(),
            'unique_references' => count(array_unique($refTexts)),
            'sections' => $crawler->filter('section')->count(),
        ];

        return $this->proseStats;
    }
198
199
    /**
     * Count the number of bytes, characters and words of the plain text.
     *
     * Only paragraphs that are direct children of a section are counted. Coordinates, templates,
     * TemplateStyles, math and reference nodes are removed from the document first. Note that this
     * removal modifies the DOM held by the given Crawler.
     * @param Crawler $crawler
     * @return array [num bytes, num chars, num words]
     */
    private function countCharsAndWords(Crawler $crawler): array
    {
        $totalBytes = 0;
        $totalChars = 0;
        $totalWords = 0;
        $paragraphs = $crawler->filter('section > p');

        // Remove templates, TemplateStyles, math and reference tags.
        $crawler->filter(implode(',', [
            '#coordinates',
            '[class*="emplate"]',
            '[typeof~="mw:Extension/templatestyles"]',
            '[typeof~="mw:Extension/math"]',
            '[typeof~="mw:Extension/ref"]',
        ]))->each(function (Crawler $subCrawler) {
            foreach ($subCrawler as $subNode) {
                $subNode->parentNode->removeChild($subNode);
            }
        });

        $paragraphs->each(function ($node) use (&$totalBytes, &$totalChars, &$totalWords): void {
            /** @var Crawler $node */
            $text = $node->text();
            $totalBytes += strlen($text);
            $totalChars += mb_strlen($text);

            // explode() returns [''] for an empty string and yields '' between consecutive spaces;
            // dropping those tokens keeps empty paragraphs and repeated spaces from counting as words.
            $words = array_filter(
                explode(' ', $text),
                static function (string $word): bool {
                    return '' !== $word;
                }
            );
            $totalWords += count($words);
        });

        return [$totalBytes, $totalChars, $totalWords];
    }
234
235
    /**
     * Get the page assessments of the page, as provided by the project's PageAssessments.
     * @see https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:PageAssessments
     * @return string[]|null null if unsupported.
     * @codeCoverageIgnore
     */
    public function getAssessments(): ?array
    {
        if (isset($this->assessments)) {
            return $this->assessments;
        }

        $pageAssessments = $this->page
            ->getProject()
            ->getPageAssessments();
        $this->assessments = $pageAssessments->getAssessments($this->page);

        return $this->assessments;
    }
251
252
    /**
     * Get the list of the page's Wikidata and Checkwiki errors.
     * The list is fetched from the page once and then kept on this object.
     * @see Page::getErrors()
     * @return string[]
     */
    public function getBugs(): array
    {
        if (isset($this->bugs)) {
            return $this->bugs;
        }

        $this->bugs = $this->page->getErrors();

        return $this->bugs;
    }
264
265
    /**
     * Get the number of Wikidata and CheckWiki errors.
     * @return int
     */
    public function numBugs(): int
    {
        $bugs = $this->getBugs();

        return count($bugs);
    }
273
274
    /**
     * Generate the data structure that will be used in the PageInfo API response.
     *
     * Project and page details are always included. Revision statistics are added only when the
     * repository query succeeds and returns data; on failure an 'error' message is set instead.
     * @param Project $project
     * @param Page $page
     * @return array
     * @codeCoverageIgnore
     */
    public function getPageInfoApiData(Project $project, Page $page): array
    {
        $data = [
            'project' => $project->getDomain(),
            'page' => $page->getTitle(),
            'watchers' => $page->getWatchers(),
            'pageviews' => $page->getLatestPageviews(),
            'pageviews_offset' => self::PAGEVIEWS_OFFSET,
        ];

        // Must be initialised here: when the query throws, the assignment inside the try block
        // never happens, yet $info is still inspected afterwards.
        $info = null;

        try {
            $info = $this->repository->getBasicEditingInfo($page);
        } catch (ServiceUnavailableHttpException $e) {
            // No more open database connections.
            $data['error'] = 'Unable to fetch revision data. Please try again later.';
        } catch (HttpException $e) {
            /**
             * The query most likely exceeded the maximum query time,
             * so we'll abort and give only info retrieved by the API.
             */
            $data['error'] = 'Unable to fetch revision data. The query may have timed out.';
        }

        // Without revision data there is nothing more to add.
        if (!$info) {
            return $data;
        }

        $createdAt = DateTime::createFromFormat('YmdHis', $info['created_at']);
        $modifiedAt = DateTime::createFromFormat('YmdHis', $info['modified_at']);
        $now = new DateTime;
        $secondsSinceLastEdit = $now->getTimestamp() - $modifiedAt->getTimestamp();

        $pageAssessments = $page->getProject()->getPageAssessments();
        $assessment = $pageAssessments->getAssessment($page);

        // The creator's edit count may be unknown, in which case it stays null rather than becoming 0.
        $creatorEditCount = null;
        if (null !== $info['creator_editcount']) {
            $creatorEditCount = (int) $info['creator_editcount'];
        }

        $revisionData = [
            'revisions' => (int) $info['num_edits'],
            'editors' => (int) $info['num_editors'],
            'ip_edits' => (int) $info['ip_edits'],
            'minor_edits' => (int) $info['minor_edits'],
            'creator' => $info['creator'],
            'creator_editcount' => $creatorEditCount,
            'created_at' => $createdAt,
            'created_rev_id' => $info['created_rev_id'],
            'modified_at' => $modifiedAt,
            'secs_since_last_edit' => $secondsSinceLastEdit,
            'modified_rev_id' => (int) $info['modified_rev_id'],
            'assessment' => $assessment,
        ];

        return array_merge($data, $revisionData);
    }
333
334
    /************************ Link statistics ************************/
335
336
    /**
     * Get the number of external links on the page.
     * @return int The 'links_ext_count' entry of the links and redirects data.
     */
    public function linksExtCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_ext_count'];
    }
344
345
    /**
     * Get the number of incoming links to the page.
     * @return int The 'links_in_count' entry of the links and redirects data.
     */
    public function linksInCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_in_count'];
    }
353
354
    /**
     * Get the number of outgoing links from the page.
     * @return int The 'links_out_count' entry of the links and redirects data.
     */
    public function linksOutCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_out_count'];
    }
362
363
    /**
     * Get the number of redirects to the page.
     * @return int The 'redirects_count' entry of the links and redirects data.
     */
    public function redirectsCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['redirects_count'];
    }
371
372
    /**
     * Get the number of external, incoming and outgoing links, along with the number of redirects to the page.
     * The counts are fetched from the page once and then kept on this object.
     * @return int[]
     * @codeCoverageIgnore
     */
    private function getLinksAndRedirects(): array
    {
        if (isset($this->linksAndRedirects)) {
            return $this->linksAndRedirects;
        }

        $this->linksAndRedirects = $this->page->countLinksAndRedirects();

        return $this->linksAndRedirects;
    }
384
385
    /**
     * Fetch transclusion data (categories, templates and files) that are on the page.
     * The data is fetched from the repository once and then kept on this object.
     * @return array With keys 'categories', 'templates' and 'files'.
     */
    public function getTransclusionData(): array
    {
        if (isset($this->transclusionData)) {
            return $this->transclusionData;
        }

        $this->transclusionData = $this->repository->getTransclusionData($this->page);

        return $this->transclusionData;
    }
396
397
    /**
     * Get the number of categories that are on the page.
     * @return int The 'categories' entry of the transclusion data.
     */
    public function getNumCategories(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['categories'];
    }
405
406
    /**
     * Get the number of templates that are on the page.
     * @return int The 'templates' entry of the transclusion data.
     */
    public function getNumTemplates(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['templates'];
    }
414
415
    /**
     * Get the number of files that are on the page.
     * @return int The 'files' entry of the transclusion data.
     */
    public function getNumFiles(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['files'];
    }
423
424
    /************************ Bot statistics ************************/
425
426
    /**
     * Number of edits made to the page by current or former bots.
     * The total is computed once and then kept on this object; once it is set, $bots is ignored.
     * @param string[][] $bots Used only in unit tests, where we supply mock data for the bots that will get processed.
     * @return int
     */
    public function getBotRevisionCount(?array $bots = null): int
    {
        if (isset($this->botRevisionCount)) {
            return $this->botRevisionCount;
        }

        if (null === $bots) {
            $bots = $this->getBots();
        }

        // Add up the 'count' of every bot.
        $total = array_reduce(
            $bots,
            static function ($sum, $bot) {
                return $sum + $bot['count'];
            },
            0
        );

        $this->botRevisionCount = $total;

        return $total;
    }
450
451
    /**
     * Get and set $this->bots about bots that edited the page. This is done separately from the main query because
     * we use this information when computing the top 10 editors in PageInfo, where we don't want to include bots.
     *
     * The list is assigned to $this->bots only after it has been fully built, so a failure while querying
     * does not leave an empty list cached for later calls.
     * @return array Keyed by username, each with an int 'count' and a bool 'current', sorted by count descending.
     */
    public function getBots(): array
    {
        if (isset($this->bots)) {
            return $this->bots;
        }

        // Cap the query at the configured maximum when the page has too many revisions.
        $limit = $this->tooManyRevisions() ? $this->repository->getMaxPageRevisions() : null;

        $botData = $this->repository->getBotData($this->page, $this->start, $this->end, $limit);

        // Parse the bot edits.
        $bots = [];
        while ($bot = $botData->fetchAssociative()) {
            $bots[$bot['username']] = [
                'count' => (int)$bot['count'],
                'current' => '1' === $bot['current'],
            ];
        }

        // Sort by edit count, highest first, preserving the username keys.
        uasort($bots, static function (array $a, array $b): int {
            return $b['count'] <=> $a['count'];
        });

        $this->bots = $bots;

        return $this->bots;
    }
482
483
    /**
     * Get the number of bots that edited the page.
     * @return int
     */
    public function getNumBots(): int
    {
        $bots = $this->getBots();

        return count($bots);
    }
491
492
    /**
     * Get counts of (semi-)automated tools used to edit the page, within the configured date range.
     * @return array
     */
    public function getAutoEditsCounts(): array
    {
        $counts = $this->repository->getAutoEditsCounts(
            $this->page,
            $this->start,
            $this->end
        );

        return $counts;
    }
500
}
501