Passed
Branch master (0917e1)
by MusikAnimal
11:12
created

ArticleInfoApi::getTransclusionData()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 4
nc 2
nop 0
dl 0
loc 7
ccs 5
cts 5
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
declare(strict_types = 1);
3
4
namespace AppBundle\Model;
5
6
use AppBundle\Repository\ArticleInfoRepository;
7
use DateTime;
8
use Doctrine\DBAL\Statement;
9
use Symfony\Component\DependencyInjection\ContainerInterface;
10
use Symfony\Component\DomCrawler\Crawler;
11
use Symfony\Component\HttpKernel\Exception\HttpException;
12
use Symfony\Component\HttpKernel\Exception\ServiceUnavailableHttpException;
13
14
/**
15
 * An ArticleInfoApi is standalone logic for the Article Info tool. These methods perform SQL queries
16
 * or make API requests and can be called directly, without any knowledge of the child ArticleInfo class.
17
 * It does require that the ArticleInfoRepository be set, however.
18
 * @see ArticleInfo
19
 */
20
class ArticleInfoApi extends Model
21
{
22
    /** @var ContainerInterface The application's DI container. */
23
    protected $container;
24
25
    /** @var int Number of revisions that belong to the page. */
26
    protected $numRevisions;
27
28
    /** @var mixed[] Prose stats, with keys 'characters', 'words', 'references', 'unique_references', 'sections'. */
29
    protected $proseStats;
30
31
    /** @var array Number of categories, templates and files on the page. */
32
    protected $transclusionData;
33
34
    /** @var mixed[] Various statistics about bots that edited the page. */
35
    protected $bots;
36
37
    /** @var int Number of edits made to the page by bots. */
38
    protected $botRevisionCount;
39
40
    /** @var int[] Number of in and outgoing links and redirects to the page. */
41
    protected $linksAndRedirects;
42
43
    /** @var string[] Assessments of the page (see Page::getAssessments). */
44
    protected $assessments;
45
46
    /** @var string[] List of Wikidata and Checkwiki errors. */
47
    protected $bugs;
48
49
    /**
     * Create an ArticleInfoApi for the given page and optional date range.
     * @param Page $page The page to process.
     * @param ContainerInterface $container The DI container.
     * @param false|int $start Start date as Unix timestamp (false by default).
     * @param false|int $end End date as Unix timestamp (false by default).
     */
    public function __construct(Page $page, ContainerInterface $container, $start = false, $end = false)
    {
        // Dependencies first, then the date range; the assignments are independent of each other.
        $this->container = $container;
        $this->page = $page;
        $this->end = $end;
        $this->start = $start;
    }
63
64
    /**
     * Number of revisions belonging to the page, as reported by Page::getNumRevisions()
     * for the configured start and end dates. The value is memoized on first use.
     * @return int
     */
    public function getNumRevisions(): int
    {
        if (isset($this->numRevisions)) {
            return $this->numRevisions;
        }

        $this->numRevisions = $this->page->getNumRevisions(null, $this->start, $this->end);

        return $this->numRevisions;
    }
75
76
    /**
     * Get various basic info used in the API, including the number of revisions, unique authors, initial author
     * and edit count of the initial author. This is combined into one query for better performance. Caching is
     * intentionally disabled, because using the gadget, this will get hit for a different page constantly, where
     * the likelihood of cache benefiting us is slim.
     * @return string[]|false false if the page was not found.
     */
    public function getBasicEditingInfo()
    {
        // getRepository() is declared to return the base Repository; the method called below
        // lives on the ArticleInfoRepository sub-type, hence the explicit annotation.
        /** @var ArticleInfoRepository $repository */
        $repository = $this->getRepository();

        return $repository->getBasicEditingInfo($this->page);
    }
87
88
    /**
     * Get the top editors to the page by edit count.
     *
     * Results are cached for the duration of the request. The cache is keyed on the page, the date range
     * and both arguments, so calls with different inputs never receive each other's results.
     * @param int $limit Default 20, maximum 1,000.
     * @param bool $noBots Set to non-false to exclude bots from the result.
     * @return array List of editors, each with keys 'rank', 'username', 'count', 'minor',
     *   'first_edit' and 'latest_edit' (the last two having keys 'id' and 'timestamp').
     */
    public function getTopEditorsByEditCount(int $limit = 20, bool $noBots = false): array
    {
        // Quick cache, valid only for the same request. A static local is shared by every instance of the
        // class, so it must be keyed on everything that influences the query.
        static $cache = [];

        $limit = min($limit, 1000);

        // NOTE(review): assumes project domain + page title uniquely identify the page -- confirm.
        $cacheKey = serialize([
            $this->page->getProject()->getDomain(),
            $this->page->getTitle(),
            $this->start,
            $this->end,
            $limit,
            $noBots,
        ]);

        if (isset($cache[$cacheKey])) {
            return $cache[$cacheKey];
        }

        /** @var ArticleInfoRepository $repository */
        $repository = $this->getRepository();
        $rows = $repository->getTopEditorsByEditCount(
            $this->page,
            $this->start,
            $this->end,
            $limit,
            $noBots
        );

        $topEditors = [];
        $rank = 0;
        foreach ($rows as $row) {
            $topEditors[] = [
                'rank' => ++$rank,
                'username' => $row['username'],
                'count' => $row['count'],
                'minor' => $row['minor'],
                'first_edit' => [
                    'id' => $row['first_revid'],
                    'timestamp' => $row['first_timestamp'],
                ],
                'latest_edit' => [
                    'id' => $row['latest_revid'],
                    'timestamp' => $row['latest_timestamp'],
                ],
            ];
        }

        $cache[$cacheKey] = $topEditors;

        return $topEditors;
    }
131
132
    /**
     * Get prose and reference information, computed from the page's HTML and memoized.
     * When the end date is an integer timestamp it is passed to Page::getHTMLContent() as a DateTime.
     * @return array With keys 'characters', 'words', 'references', 'unique_references', 'sections'
     */
    public function getProseStats(): array
    {
        if (!isset($this->proseStats)) {
            $asOf = null;
            if (is_int($this->end)) {
                $asOf = new DateTime("@{$this->end}");
            }

            $crawler = new Crawler($this->page->getHTMLContent($asOf));

            // Collect the text of every reference so that duplicates can be collapsed below.
            $references = $crawler->filter('#mw-content-text .reference');
            $referenceTexts = $references->each(static function ($reference) {
                return $reference->text();
            });

            [$characters, $wordCount] = $this->countCharsAndWords($crawler, '#mw-content-text p');

            $this->proseStats = [
                'characters' => $characters,
                'words' => $wordCount,
                'references' => $references->count(),
                'unique_references' => count(array_unique($referenceTexts)),
                'sections' => $crawler->filter('#mw-content-text .mw-headline')->count(),
            ];
        }

        return $this->proseStats;
    }
167
168
    /**
     * Count the number of characters and words of the plain text within the DOM elements matched by the
     * given selector. Numeric citation markers such as "[12]" are excluded from both counts.
     * @param Crawler $crawler
     * @param string $selector HTML selector.
     * @return array [num chars, num words]
     */
    private function countCharsAndWords(Crawler $crawler, string $selector): array
    {
        $totalChars = 0;
        $totalWords = 0;

        $crawler->filter($selector)->each(function (Crawler $node) use (&$totalChars, &$totalWords): void {
            // Remove citation markers first; doing so can leave doubled or trailing spaces behind,
            // so collapse and trim afterwards rather than before.
            $text = preg_replace('/\[\d+]/', '', $node->text(null, true));
            $text = trim(preg_replace('/ {2,}/', ' ', $text));

            // Count characters, not bytes: page text routinely contains multi-byte UTF-8.
            $totalChars += mb_strlen($text, 'UTF-8');

            // explode() on an empty string yields one (empty) element, so guard against empty paragraphs.
            $totalWords += '' === $text ? 0 : count(explode(' ', $text));
        });

        return [$totalChars, $totalWords];
    }
188
189
    /**
     * Get the page assessments of the page. The result is memoized, including the
     * false that signals an unsupported project.
     * @see https://www.mediawiki.org/wiki/Extension:PageAssessments
     * @return string[]|false False if unsupported.
     * @codeCoverageIgnore
     */
    public function getAssessments()
    {
        // Compare against null rather than is_array(): a false (unsupported) result is a valid
        // answer and must be cached too, otherwise it is looked up again on every call.
        if (null === $this->assessments) {
            $this->assessments = $this->page
                ->getProject()
                ->getPageAssessments()
                ->getAssessments($this->page);
        }

        return $this->assessments;
    }
205
206
    /**
     * Get the list of the page's Wikidata and Checkwiki errors, fetched once via Page::getErrors().
     * @see Page::getErrors()
     * @return string[]
     */
    public function getBugs(): array
    {
        if (is_array($this->bugs)) {
            return $this->bugs;
        }

        $this->bugs = $this->page->getErrors();

        return $this->bugs;
    }
218
219
    /**
     * Get the number of Wikidata and CheckWiki errors.
     * @return int
     */
    public function numBugs(): int
    {
        $bugs = $this->getBugs();

        return count($bugs);
    }
227
228
    /**
     * Generate the data structure that will be used in the ArticleInfo API response.
     *
     * Project, page, watcher and pageview data are always present. Revision data is merged in only when
     * the database query succeeds; otherwise an 'error' key explains why it is missing.
     * @param Project $project
     * @param Page $page
     * @return array
     * @codeCoverageIgnore
     */
    public function getArticleInfoApiData(Project $project, Page $page): array
    {
        /** @var int $pageviewsOffset Number of days to query for pageviews */
        $pageviewsOffset = 30;

        $data = [
            'project' => $project->getDomain(),
            'page' => $page->getTitle(),
            'watchers' => (int) $page->getWatchers(),
            'pageviews' => $page->getLastPageviews($pageviewsOffset),
            'pageviews_offset' => $pageviewsOffset,
        ];

        $info = false;

        try {
            $articleInfoRepo = new ArticleInfoRepository();
            $articleInfoRepo->setContainer($this->container);
            $info = $articleInfoRepo->getBasicEditingInfo($page);
        } catch (ServiceUnavailableHttpException $e) {
            // No more open database connections.
            $data['error'] = 'Unable to fetch revision data. Please try again later.';
        } catch (HttpException $e) {
            /**
             * The query most likely exceeded the maximum query time,
             * so we'll abort and give only info retrieved by the API.
             */
            $data['error'] = 'Unable to fetch revision data. The query may have timed out.';
        }

        if (false === $info) {
            return $data;
        }

        // Some wikis (such as foundation.wikimedia.org) may be missing the creation date. Casting to string
        // keeps a missing (null) value from raising a TypeError under strict_types; createFromFormat() then
        // returns false, which is mapped to null below. The modification date gets the same treatment.
        $createdAt = DateTime::createFromFormat('YmdHis', (string) ($info['created_at'] ?? ''));
        $modifiedAt = DateTime::createFromFormat('YmdHis', (string) ($info['modified_at'] ?? ''));

        $assessment = $page->getProject()
            ->getPageAssessments()
            ->getAssessment($page);

        return array_merge($data, [
            'revisions' => (int) $info['num_edits'],
            'editors' => (int) $info['num_editors'],
            'minor_edits' => (int) $info['minor_edits'],
            'author' => $info['author'],
            'author_editcount' => (int) $info['author_editcount'],
            'created_at' => false === $createdAt ? null : $createdAt->format('Y-m-d'),
            'created_rev_id' => $info['created_rev_id'],
            'modified_at' => false === $modifiedAt ? null : $modifiedAt->format('Y-m-d H:i'),
            'secs_since_last_edit' => false === $modifiedAt
                ? null
                : (new DateTime())->getTimestamp() - $modifiedAt->getTimestamp(),
            'last_edit_id' => (int) $info['modified_rev_id'],
            'assessment' => $assessment,
        ]);
    }
296
297
    /************************ Link statistics ************************/
298
299
    /**
     * Number of external links on the page (the 'links_ext_count' entry of the link statistics).
     * @return int
     */
    public function linksExtCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_ext_count'];
    }
307
308
    /**
     * Number of incoming links to the page (the 'links_in_count' entry of the link statistics).
     * @return int
     */
    public function linksInCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_in_count'];
    }
316
317
    /**
     * Number of outgoing links from the page (the 'links_out_count' entry of the link statistics).
     * @return int
     */
    public function linksOutCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_out_count'];
    }
325
326
    /**
     * Number of redirects to the page (the 'redirects_count' entry of the link statistics).
     * @return int
     */
    public function redirectsCount(): int
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['redirects_count'];
    }
334
335
    /**
     * Get the number of external, incoming and outgoing links, along with the number of redirects to the page.
     * Fetched once through Page::countLinksAndRedirects() and then reused.
     * @return int[]
     * @codeCoverageIgnore
     */
    private function getLinksAndRedirects(): array
    {
        if (is_array($this->linksAndRedirects)) {
            return $this->linksAndRedirects;
        }

        $this->linksAndRedirects = $this->page->countLinksAndRedirects();

        return $this->linksAndRedirects;
    }
347
348
    /**
     * Fetch transclusion data (categories, templates and files) that are on the page.
     * The repository is queried on first use only; later calls reuse the stored array.
     * @return array With keys 'categories', 'templates' and 'files'.
     */
    public function getTransclusionData(): array
    {
        if (is_array($this->transclusionData)) {
            return $this->transclusionData;
        }

        $repository = $this->getRepository();
        $this->transclusionData = $repository->getTransclusionData($this->page);

        return $this->transclusionData;
    }
360
361
    /**
     * Get the number of categories that are on the page (the 'categories' entry of the transclusion data).
     * @return int
     */
    public function getNumCategories(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['categories'];
    }
369
370
    /**
     * Get the number of templates that are on the page (the 'templates' entry of the transclusion data).
     * @return int
     */
    public function getNumTemplates(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['templates'];
    }
378
379
    /**
     * Get the number of files that are on the page (the 'files' entry of the transclusion data).
     * @return int
     */
    public function getNumFiles(): int
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['files'];
    }
387
388
    /************************ Bot statistics ************************/
389
390
    /**
     * Number of edits made to the page by current or former bots. The total is memoized after the first call.
     * @param array[]|null $bots Used only in unit tests, where we supply mock data for the bots that will get
     *   processed. Each entry must have a 'count' key. When null, the result of getBots() is used.
     * @return int
     */
    public function getBotRevisionCount(?array $bots = null): int
    {
        if (!isset($this->botRevisionCount)) {
            $botStats = $bots ?? $this->getBots();

            // Sum the 'count' of every bot; the keys of the list are irrelevant here.
            $this->botRevisionCount = array_reduce(
                $botStats,
                static function ($total, $stats) {
                    return $total + $stats['count'];
                },
                0
            );
        }

        return $this->botRevisionCount;
    }
414
415
    /**
     * Get statistics about the bots that edited the page, keyed by username and sorted by edit count
     * (highest first). The result is stored in $this->bots on first use, because the same information
     * is needed when computing the top editors in ArticleInfo, where bots should not be included.
     * @return mixed[] Keys are usernames; values have keys 'count' (int) and 'current' (bool).
     */
    public function getBots(): array
    {
        if (isset($this->bots)) {
            return $this->bots;
        }

        /** @var ArticleInfoRepository $repository */
        $repository = $this->getRepository();

        /** @var Statement $botData */
        $botData = $repository->getBotData($this->page, $this->start, $this->end);

        // Build the list locally and only store it once complete, so that an exception thrown
        // while querying or fetching cannot leave an empty list cached on the instance.
        $bots = [];
        while ($bot = $botData->fetch()) {
            $bots[$bot['username']] = [
                'count' => (int) $bot['count'],
                'current' => '1' === $bot['current'],
            ];
        }

        // Sort by edit count, descending, preserving the username keys.
        uasort($bots, static function (array $a, array $b): int {
            return $b['count'] <=> $a['count'];
        });

        $this->bots = $bots;

        return $this->bots;
    }
445
446
    /**
     * Get the number of bots that edited the page, i.e. the size of the list returned by getBots().
     * @return int
     */
    public function getNumBots(): int
    {
        $bots = $this->getBots();

        return count($bots);
    }
454
}
455