Passed
Pull Request — master (#157)
by MusikAnimal
05:00
created

ArticleInfo::countTokens()   B

Complexity

Conditions 4
Paths 5

Size

Total Lines 29
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 4

Importance

Changes 0
Metric Value
cc 4
eloc 15
nc 5
nop 1
dl 0
loc 29
ccs 15
cts 15
cp 1
crap 4
rs 8.5806
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file contains only the ArticleInfo class.
4
 */
5
6
namespace Xtools;
7
8
use Symfony\Component\DependencyInjection\Container;
9
use Symfony\Component\DomCrawler\Crawler;
10
use DateTime;
11
12
/**
13
 * An ArticleInfo provides statistics about a page on a project. This model does not
14
 * have a separate Repository because it needs to use individual SQL statements to
15
 * traverse the page's history, saving class instance variables along the way.
16
 */
17
class ArticleInfo extends Model
18
{
19
    /** @const string[] Domain names of wikis supported by WikiWho. */
20
    const TEXTSHARE_WIKIS = [
21
        'en.wikipedia.org',
22
        'de.wikipedia.org',
23
        'eu.wikipedia.org',
24
        'tr.wikipedia.org',
25
        'es.wikipedia.org',
26
    ];
27
28
    /** @var Container The application's DI container. */
29
    protected $container;
30
31
    /** @var Page The page. */
32
    protected $page;
33
34
    /** @var false|int From what date to obtain records. */
35
    protected $startDate;
36
37
    /** @var false|int To what date to obtain records. */
38
    protected $endDate;
39
40
    /** @var int Number of revisions that belong to the page. */
41
    protected $numRevisions;
42
43
    /** @var int Maximum number of revisions to process, as configured. */
44
    protected $maxRevisions;
45
46
    /** @var int Number of revisions that were actually processed. */
47
    protected $numRevisionsProcessed;
48
49
    /**
50
     * Various statistics about editors to the page. These are not User objects
51
     * so as to preserve memory.
52
     * @var mixed[]
53
     */
54
    protected $editors;
55
56
    /** @var mixed[] The top 10 editors to the page by number of edits. */
57
    protected $topTenEditorsByEdits;
58
59
    /** @var mixed[] The top 10 editors to the page by added text. */
60
    protected $topTenEditorsByAdded;
61
62
    /** @var int Number of edits made by the top 10 editors. */
63
    protected $topTenCount;
64
65
    /** @var mixed[] Various statistics about bots that edited the page. */
66
    protected $bots;
67
68
    /** @var int Number of edits made to the page by bots. */
69
    protected $botRevisionCount;
70
71
    /** @var mixed[] Various counts about each individual year and month of the page's history. */
72
    protected $yearMonthCounts;
73
74
    /** @var Edit The first edit to the page. */
75
    protected $firstEdit;
76
77
    /** @var Edit The last edit to the page. */
78
    protected $lastEdit;
79
80
    /** @var Edit Edit that made the largest addition by number of bytes. */
81
    protected $maxAddition;
82
83
    /** @var Edit Edit that made the largest deletion by number of bytes. */
84
    protected $maxDeletion;
85
86
    /** @var int[] Number of in and outgoing links and redirects to the page. */
87
    protected $linksAndRedirects;
88
89
    /** @var string[] Assessments of the page (see Page::getAssessments). */
90
    protected $assessments;
91
92
    /**
93
     * Maximum number of edits that were created across all months. This is used as a comparison
94
     * for the bar charts in the months section.
95
     * @var int
96
     */
97
    protected $maxEditsPerMonth;
98
99
    /** @var string[] List of (semi-)automated tools that were used to edit the page. */
100
    protected $tools;
101
102
    /**
103
     * Total number of bytes added throughout the page's history. This is used as a comparison
104
     * when computing the top 10 editors by added text.
105
     * @var int
106
     */
107
    protected $addedBytes = 0;
108
109
    /** @var int Number of days between first and last edit. */
110
    protected $totalDays;
111
112
    /** @var int Number of minor edits to the page. */
113
    protected $minorCount = 0;
114
115
    /** @var int Number of anonymous edits to the page. */
116
    protected $anonCount = 0;
117
118
    /** @var int Number of automated edits to the page. */
119
    protected $automatedCount = 0;
120
121
    /** @var int Number of edits to the page that were reverted with the subsequent edit. */
122
    protected $revertCount = 0;
123
124
    /** @var int[] The "edits per <time>" counts. */
125
    protected $countHistory = [
126
        'day' => 0,
127
        'week' => 0,
128
        'month' => 0,
129
        'year' => 0
130
    ];
131
132
    /** @var string[] List of wikidata and Checkwiki errors. */
133
    protected $bugs;
134
135
    /** @var array List of editors and the percentage of the current content that they authored. */
136
    protected $textshares;
137
138
    /** @var array Number of categories, templates and files on the page. */
139
    protected $transclusionData;
140
141
    /**
     * Create an ArticleInfo for a page, optionally restricted to a date range.
     * @param Page $page The page to process.
     * @param Container $container The DI container.
     * @param false|int $start From what date to obtain records; false if no start date is given.
     * @param false|int $end To what date to obtain records; false if no end date is given.
     */
    public function __construct(Page $page, Container $container, $start = false, $end = false)
    {
        // Date range first, then the collaborators; the assignments are independent.
        $this->startDate = $start;
        $this->endDate = $end;
        $this->container = $container;
        $this->page = $page;
    }
155
156
    /**
     * Get the start of the date range, as passed to the constructor.
     * @return false|int False if no start date was given.
     */
160 1
    public function getStartDate()
161
    {
162 1
        return $this->startDate;
163
    }
164
165
    /**
     * Get the end of the date range, as passed to the constructor.
     * @return false|int False if no end date was given.
     */
169 1
    public function getEndDate()
170
    {
171 1
        return $this->endDate;
172
    }
173
174
    /**
     * Get the last day of the month we should show in the month/year sections.
     * The month is that of $this->endDate when an end date is set, otherwise
     * the current month.
     * @return int As Unix timestamp.
     */
    private function getLastDay()
    {
        // No end date: fall back to the current month.
        if ($this->endDate === false) {
            return strtotime('last day of this month');
        }

        $endOfRange = new DateTime('@'.$this->endDate);
        $endOfRange->modify('last day of this month');

        return $endOfRange->getTimestamp();
    }
189
190
    /**
     * Whether a start date, an end date, or both were given.
     * @return bool
     */
    public function hasDateRange()
    {
        // There is no range only when both bounds are unset (false).
        return !($this->startDate === false && $this->endDate === false);
    }
198
199
    /**
     * Return the start/end date values as associative array,
     * with YYYY-MM-DD as the date format. This is used mainly as
     * a helper to pass to the pageviews Twig macros.
     *
     * An explicitly given bound is used as-is; a missing bound falls back to
     * the date of the first (for 'start') or last (for 'end') edit.
     * @return array Empty if there is no date range, otherwise with keys 'start' and 'end'.
     */
    public function getDateParams()
    {
        if (!$this->hasDateRange()) {
            return [];
        }

        // Only consult the first/last edit for a bound that was not given, so that
        // they are never dereferenced when both bounds are explicit.
        return [
            'start' => $this->startDate !== false
                ? date('Y-m-d', $this->startDate)
                : $this->firstEdit->getTimestamp()->format('Y-m-d'),
            'end' => $this->endDate !== false
                ? date('Y-m-d', $this->endDate)
                : $this->lastEdit->getTimestamp()->format('Y-m-d'),
        ];
    }
225
226
    /**
     * Shorthand to get the page's project; delegates to Page::getProject().
     * @return Project
     * @codeCoverageIgnore
     */
231
    public function getProject()
232
    {
233
        return $this->page->getProject();
234
    }
235
236
    /**
     * Get the number of revisions belonging to the page within the date range.
     * The value is fetched from the Page once and then cached on the instance.
     * @return int
     */
    public function getNumRevisions()
    {
        if (isset($this->numRevisions)) {
            return $this->numRevisions;
        }

        $this->numRevisions = $this->page->getNumRevisions(null, $this->startDate, $this->endDate);

        return $this->numRevisions;
    }
247
248
    /**
     * Get the maximum number of revisions that we should process, read from
     * the app.max_page_revisions parameter and cached on the instance.
     * @return int
     */
    public function getMaxRevisions()
    {
        if (isset($this->maxRevisions)) {
            return $this->maxRevisions;
        }

        $configured = $this->container->getParameter('app.max_page_revisions');
        $this->maxRevisions = (int) $configured;

        return $this->maxRevisions;
    }
259
260
    /**
     * Get the number of revisions that are actually getting processed.
     * This goes by the app.max_page_revisions parameter, or the actual
     * number of revisions, whichever is smaller. The result is cached.
     * @return int
     */
    public function getNumRevisionsProcessed()
    {
        if (!isset($this->numRevisionsProcessed)) {
            $this->numRevisionsProcessed = $this->tooManyRevisions()
                ? $this->getMaxRevisions()
                : $this->getNumRevisions();
        }

        return $this->numRevisionsProcessed;
    }
280
281
    /**
     * Are there more revisions than we should process, based on the config?
     * A configured maximum of zero or less never counts as exceeded.
     * @return bool
     */
    public function tooManyRevisions()
    {
        $maximum = $this->getMaxRevisions();

        if ($maximum <= 0) {
            return false;
        }

        return $this->getNumRevisions() > $maximum;
    }
289
290
    /**
     * Fetch and store all the data we need to show the ArticleInfo view.
     * Runs parseHistory(), setLogsEvents(), setBots() and setTopTenCounts(), in that order.
     * @codeCoverageIgnore
     */
294
    public function prepareData()
295
    {
296
        $this->parseHistory();
297
        $this->setLogsEvents();
298
299
        // Bots need to be set before setting top 10 counts.
300
        $this->setBots();
301
302
        $this->setTopTenCounts();
303
    }
304
305
    /**
     * Get the number of editors that edited the page.
     * @return int Zero if no editors have been collected yet.
     */
    public function getNumEditors()
    {
        // $editors has no default value, and count() on a non-countable
        // emits a warning (PHP 7.2+) or throws a TypeError (PHP 8).
        return is_array($this->editors) ? count($this->editors) : 0;
    }
313
314
    /**
     * Get the number of bots that edited the page, i.e. the number of
     * entries returned by getBots().
     * @return int
     */
318
    public function getNumBots()
319
    {
320
        return count($this->getBots());
321
    }
322
323
    /**
     * Get the number of days between the first and last edit.
     * The value is computed once from the first/last edit timestamps and cached.
     * @return int
     */
    public function getTotalDays()
    {
        if (isset($this->totalDays)) {
            return $this->totalDays;
        }

        $dateFirst = $this->firstEdit->getTimestamp();
        $dateLast = $this->lastEdit->getTimestamp();
        $interval = date_diff($dateLast, $dateFirst, true);

        // DateInterval::format() returns a string; cast it so that the cached
        // value matches the declared int type of $totalDays.
        $this->totalDays = (int) $interval->format('%a');

        return $this->totalDays;
    }
338
339
    /**
     * Returns length of the page. With a date range this is the length as of
     * the last processed edit, otherwise the length reported by the Page.
     * @return int
     */
    public function getLength()
    {
        return $this->hasDateRange()
            ? $this->lastEdit->getLength()
            : $this->page->getLength();
    }
351
352
    /**
     * Get the average number of days between edits to the page.
     * @return double Rounded to one decimal; 0 if no revisions were processed.
     */
    public function averageDaysPerEdit()
    {
        $processed = $this->getNumRevisionsProcessed();

        if (!$processed) {
            // Avoid dividing by zero when there are no revisions.
            return 0.0;
        }

        return round($this->getTotalDays() / $processed, 1);
    }
360
361
    /**
     * Get the average number of edits per day to the page.
     *
     * NOTE(review): the divisor (365 / 12 / 24) is roughly 1.27 days, not one
     * day, so the result is about 27% higher than revisions / days. Confirm
     * whether this is intended before changing it.
     * @return double Rounded to one decimal; 0 if the page's history spans zero days.
     */
    public function editsPerDay()
    {
        $days = $this->getTotalDays();

        if (!$days) {
            return round(0, 1);
        }

        $perDay = $this->getNumRevisionsProcessed() / ($days / (365 / 12 / 24));

        return round($perDay, 1);
    }
372
373
    /**
     * Get the average number of edits per month to the page, using
     * 365 / 12 days as the length of a month. The result never exceeds
     * the number of revisions processed.
     * @return double
     */
    public function editsPerMonth()
    {
        $days = $this->getTotalDays();
        $perMonth = 0;

        if ($days) {
            $perMonth = $this->getNumRevisionsProcessed() / ($days / (365 / 12));
        }

        return min($this->getNumRevisionsProcessed(), round($perMonth, 1));
    }
384
385
    /**
     * Get the average number of edits per year to the page, using 365 days
     * as the length of a year. The result never exceeds the number of
     * revisions processed.
     * @return double
     */
    public function editsPerYear()
    {
        $days = $this->getTotalDays();
        $perYear = 0;

        if ($days) {
            $perYear = $this->getNumRevisionsProcessed() / ($days / 365);
        }

        return min($this->getNumRevisionsProcessed(), round($perYear, 1));
    }
396
397
    /**
     * Get the average number of edits per editor.
     * @return double Rounded to one decimal; 0 if there are no editors.
     */
    public function editsPerEditor()
    {
        // $editors has no default value, so it may not be an array yet.
        $numEditors = is_array($this->editors) ? count($this->editors) : 0;

        if ($numEditors === 0) {
            // Avoid dividing by zero.
            return 0.0;
        }

        return round($this->getNumRevisionsProcessed() / $numEditors, 1);
    }
405
406
    /**
     * Get the percentage of minor edits to the page.
     * @return double Rounded to one decimal; 0 if no revisions were processed.
     */
    public function minorPercentage()
    {
        $processed = $this->getNumRevisionsProcessed();

        if (!$processed) {
            // Avoid dividing by zero when there are no revisions.
            return 0.0;
        }

        return round(($this->minorCount / $processed) * 100, 1);
    }
417
418
    /**
     * Get the percentage of anonymous edits to the page.
     * @return double Rounded to one decimal; 0 if no revisions were processed.
     */
    public function anonPercentage()
    {
        $processed = $this->getNumRevisionsProcessed();

        if (!$processed) {
            // Avoid dividing by zero when there are no revisions.
            return 0.0;
        }

        return round(($this->anonCount / $processed) * 100, 1);
    }
429
430
    /**
     * Get the percentage of edits made by the top 10 editors.
     * @return double Rounded to one decimal; 0 if no revisions were processed.
     */
    public function topTenPercentage()
    {
        $processed = $this->getNumRevisionsProcessed();

        if (!$processed) {
            // Avoid dividing by zero when there are no revisions.
            return 0.0;
        }

        return round(($this->topTenCount / $processed) * 100, 1);
    }
438
439
    /**
     * Get the number of times the page has been viewed in the given timeframe.
     * If the ArticleInfo instance has a date range, it is used instead of the
     * value of the $latest parameter.
     * @param  int $latest Last N days.
     * @return int
     */
    public function getPageviews($latest)
    {
        if ($this->hasDateRange()) {
            $range = $this->getDateParams();

            return $this->page->getPageviews($range['start'], $range['end']);
        }

        return $this->page->getLastPageviews($latest);
    }
455
456
    /**
     * Get the page assessments of the page.
     * An array result is cached on the instance. A non-array result (such as
     * false) is not treated as cached, so the Page is asked again on the next call.
     * @see https://www.mediawiki.org/wiki/Extension:PageAssessments
     * @return string[]|false False if unsupported.
     * @codeCoverageIgnore
     */
    public function getAssessments()
    {
        if (is_array($this->assessments)) {
            return $this->assessments;
        }

        $this->assessments = $this->page->getAssessments();

        return $this->assessments;
    }
469
470
    /**
     * Get the number of automated edits made to the page.
     * @return int Starts at 0 and is incremented in updateToolCounts() for
     *   every edit for which a tool was detected.
     */
474 1
    public function getAutomatedCount()
475
    {
476 1
        return $this->automatedCount;
477
    }
478
479
    /**
     * Get the number of edits to the page that were reverted with the subsequent edit.
     * @return int Starts at 0 and is incremented in updateContentSizesRevert().
     */
483 1
    public function getRevertCount()
484
    {
485 1
        return $this->revertCount;
486
    }
487
488
    /**
     * Get the number of edits to the page made by logged out users.
     * @return int Starts at 0.
     */
492 1
    public function getAnonCount()
493
    {
494 1
        return $this->anonCount;
495
    }
496
497
    /**
     * Get the number of minor edits to the page.
     * @return int Starts at 0.
     */
501 1
    public function getMinorCount()
502
    {
503 1
        return $this->minorCount;
504
    }
505
506
    /**
     * Get the number of edits to the page made in the past day, week, month and year.
     * @return int[] With keys 'day', 'week', 'month' and 'year'; every count starts at 0.
     */
510
    public function getCountHistory()
511
    {
512
        return $this->countHistory;
513
    }
514
515
    /**
     * Get the number of edits to the page made by the top 10 editors.
     * @return int|null Null while unset (the property has no default value).
     */
519 1
    public function getTopTenCount()
520
    {
521 1
        return $this->topTenCount;
522
    }
523
524
    /**
     * Get the first edit to the page, as determined while parsing the
     * revision history (see parseHistory()).
     * @return Edit|null Null while unset (the property has no default value).
     */
528 1
    public function getFirstEdit()
529
    {
530 1
        return $this->firstEdit;
531
    }
532
533
    /**
     * Get the last edit to the page, i.e. the edit most recently passed
     * to updateCounts().
     * @return Edit|null Null while unset (the property has no default value).
     */
537 1
    public function getLastEdit()
538
    {
539 1
        return $this->lastEdit;
540
    }
541
542
    /**
     * Get the edit that made the largest addition to the page (by number of bytes).
     * @return Edit|null Null while unset (the property has no default value).
     */
546 1
    public function getMaxAddition()
547
    {
548 1
        return $this->maxAddition;
549
    }
550
551
    /**
     * Get the edit that made the largest removal to the page (by number of bytes).
     * @return Edit|null Null while unset (the property has no default value).
     */
555 1
    public function getMaxDeletion()
556
    {
557 1
        return $this->maxDeletion;
558
    }
559
560
    /**
     * Get the list of editors to the page, including various statistics.
     * @return mixed[]|null Keyed by username; null while unset (the property
     *   has no default value).
     */
564 1
    public function getEditors()
565
    {
566 1
        return $this->editors;
567
    }
568
569
    /**
     * Get the list of the top editors to the page (by edits), including various statistics.
     * @return mixed[]|null Null while unset (the property has no default value).
     */
573 1
    public function topTenEditorsByEdits()
574
    {
575 1
        return $this->topTenEditorsByEdits;
576
    }
577
578
    /**
     * Get the list of the top editors to the page (by added text), including various statistics.
     * @return mixed[]|null Null while unset (the property has no default value).
     */
582 1
    public function topTenEditorsByAdded()
583
    {
584 1
        return $this->topTenEditorsByAdded;
585
    }
586
587
    /**
     * Get various counts about each individual year and month of the page's history.
     * @return mixed[]|null Keyed by year, each entry holding per-year counts and a
     *   'months' array keyed by month; null while unset (the property has no default value).
     */
591 2
    public function getYearMonthCounts()
592
    {
593 2
        return $this->yearMonthCounts;
594
    }
595
596
    /**
     * Get the maximum number of edits that were created across all months. This is used as a
     * comparison for the bar charts in the months section.
     * @return int|null Null while unset (the property has no default value).
     */
601 1
    public function getMaxEditsPerMonth()
602
    {
603 1
        return $this->maxEditsPerMonth;
604
    }
605
606
    /**
     * Get a list of (semi-)automated tools that were used to edit the page, including
     * the number of times they were used, and a link to the tool's homepage.
     * @return mixed[]|null Keyed by tool name, each entry with keys 'count' and 'link';
     *   null while unset (the property has no default value).
     */
611 1
    public function getTools()
612
    {
613 1
        return $this->tools;
614
    }
615
616
    /**
     * Get the list of the page's wikidata and Checkwiki errors.
     * An array result is cached on the instance; anything else is fetched
     * again from the Page on the next call.
     * @see Page::getErrors()
     * @return string[]
     */
    public function getBugs()
    {
        if (is_array($this->bugs)) {
            return $this->bugs;
        }

        $this->bugs = $this->page->getErrors();

        return $this->bugs;
    }
628
629
    /**
     * Get the number of wikidata and CheckWiki errors, i.e. the number of
     * entries returned by getBugs().
     * @return int
     */
633
    public function numBugs()
634
    {
635
        return count($this->getBugs());
636
    }
637
638
    /**
     * Get the number of external links on the page.
     * @return int The 'links_ext_count' entry of getLinksAndRedirects().
     */
    public function linksExtCount()
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_ext_count'];
    }
646
647
    /**
     * Get the number of incoming links to the page.
     * @return int The 'links_in_count' entry of getLinksAndRedirects().
     */
    public function linksInCount()
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_in_count'];
    }
655
656
    /**
     * Get the number of outgoing links from the page.
     * @return int The 'links_out_count' entry of getLinksAndRedirects().
     */
    public function linksOutCount()
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['links_out_count'];
    }
664
665
    /**
     * Get the number of redirects to the page.
     * @return int The 'redirects_count' entry of getLinksAndRedirects().
     */
    public function redirectsCount()
    {
        $counts = $this->getLinksAndRedirects();

        return $counts['redirects_count'];
    }
673
674
    /**
     * Get the number of external, incoming and outgoing links, along with
     * the number of redirects to the page. An array result is cached on the
     * instance.
     * @return int[] Read by the callers in this class with the keys 'links_ext_count',
     *   'links_in_count', 'links_out_count' and 'redirects_count'.
     * @codeCoverageIgnore
     */
    private function getLinksAndRedirects()
    {
        if (is_array($this->linksAndRedirects)) {
            return $this->linksAndRedirects;
        }

        $this->linksAndRedirects = $this->page->countLinksAndRedirects();

        return $this->linksAndRedirects;
    }
687
688
    /**
     * Parse the revision history, collecting our core statistics.
     *
     * Iterates over the rows of Page::getRevisionsStmt() (limited to
     * getMaxRevisions() when tooManyRevisions() is true), keeps track of the
     * earliest edit, passes every edit to updateCounts() and finally sorts the
     * collected editors, year/month counts and tools. Nothing is returned; the
     * results are stored on the instance.
     *
     * Untestable because it relies on getting a PDO statement. All the important
     * logic lives in other methods which are tested.
     * @codeCoverageIgnore
     */
    private function parseHistory()
    {
        // Only limit the query when there are more revisions than we should process.
        $limit = $this->tooManyRevisions() ? $this->getMaxRevisions() : null;

        // Third parameter is ignored if $limit is null.
        $revStmt = $this->page->getRevisionsStmt(
            null,
            $limit,
            $this->getNumRevisions(),
            $this->startDate,
            $this->endDate
        );
        $revCount = 0;

        /**
         * Data about previous edits so that we can use them as a basis for comparison.
         * All entries are Edits (or null), except 'prevSha' which is a SHA-1 string (or null).
         * @var mixed[]
         */
        $prevEdits = [
            // The previous Edit, used to discount content that was reverted.
            'prev' => null,

            // The SHA-1 of the edit *before* the previous edit. Used for more
            // accurate revert detection.
            'prevSha' => null,

            // The last edit deemed to be the max addition of content. This is kept track of
            // in case we find out the next edit was reverted (and was also a max edit),
            // in which case we'll want to discount it and use this one instead.
            'maxAddition' => null,

            // Same as with maxAddition, except the maximum amount of content deleted.
            // This is used to discount content that was reverted.
            'maxDeletion' => null,
        ];

        while ($rev = $revStmt->fetch()) {
            $edit = new Edit($this->page, $rev);

            // The first row seeds firstEdit. Sometimes, with old revisions (2001 era),
            // the revisions from 2002 come before 2001, so also keep the earliest timestamp.
            if ($revCount === 0 || $edit->getTimestamp() < $this->firstEdit->getTimestamp()) {
                $this->firstEdit = $edit;
            }

            $prevEdits = $this->updateCounts($edit, $prevEdits);

            $revCount++;
        }

        $this->numRevisionsProcessed = $revCount;

        // Various sorts. These properties have no default value, so skip any that
        // is not an array: arsort()/ksort() reject null.
        if (is_array($this->editors)) {
            arsort($this->editors);
        }
        if (is_array($this->yearMonthCounts)) {
            ksort($this->yearMonthCounts);
        }
        if (!empty($this->tools)) {
            arsort($this->tools);
        }
    }
762
763
    /**
     * Update various counts based on the current edit, then advance the
     * tracking data so that the given edit becomes the "previous" one.
     * The helper calls are order-dependent and must not be rearranged.
     * @param  Edit   $edit
     * @param  mixed[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     *   All are Edits or null, except 'prevSha' which is a SHA-1 string or null.
     * @return mixed[] Updated version of $prevEdits.
     */
769 4
    private function updateCounts(Edit $edit, $prevEdits)
770
    {
771
        // Update the counts for the year and month of the current edit.
772 4
        $this->updateYearMonthCounts($edit);
773
774
        // Update counts for the user who made the edit.
775 4
        $this->updateUserCounts($edit);
776
777
        // Update the year/month/user counts of anon and minor edits.
778 4
        $this->updateAnonMinorCounts($edit);
779
780
        // Update counts for automated tool usage, if applicable.
781 4
        $this->updateToolCounts($edit);
782
783
        // Increment "edits per <time>" counts
784 4
        $this->updateCountHistory($edit);
785
786
        // Update figures regarding content addition/removal, and the revert count.
787 4
        $prevEdits = $this->updateContentSizes($edit, $prevEdits);
788
789
        // Now that we've updated all the counts, we can reset
790
        // the prev and last edits, which are used for tracking.
791
        // But first, let's copy over the SHA of the actual previous edit
792
        // and put it in our $prevEdits['prevSha'], so that we'll know
793
        // that content added after it was reverted if a later edit has the same SHA.
794 4
        if ($prevEdits['prev'] !== null) {
795 4
            $prevEdits['prevSha'] = $prevEdits['prev']->getSha();
796
        }
797 4
        $prevEdits['prev'] = $edit;
798 4
        $this->lastEdit = $edit;
799
800 4
        return $prevEdits;
801
    }
802
803
    /**
     * Update various figures about content sizes based on the given edit,
     * dispatching on whether the edit is a revert.
     * @param  Edit   $edit
     * @param  Edit[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     * @return Edit[] Updated version of $prevEdits.
     */
    private function updateContentSizes(Edit $edit, $prevEdits)
    {
        if (!$this->isRevert($prevEdits, $edit)) {
            return $this->updateContentSizesNonRevert($edit, $prevEdits);
        }

        return $this->updateContentSizesRevert($prevEdits);
    }
818
819
    /**
     * Is the given Edit a revert? It is when its SHA equals the tracked
     * 'prevSha', or when the Edit itself reports being a revert.
     * @param  Edit[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     * @param  Edit $edit
     * @return bool
     */
    private function isRevert($prevEdits, $edit)
    {
        if ($edit->getSha() === $prevEdits['prevSha']) {
            return true;
        }

        return (bool) $edit->isRevert($this->container);
    }
829
830
    /**
     * Updates the figures on content sizes assuming the given edit was a revert of the previous one.
     * In such a case, we don't want to treat the previous edit as legit content addition or removal.
     * The revert count is always incremented, even when there is no previous edit.
     * @param  Edit[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     * @return Edit[] Updated version of $prevEdits, for tracking.
     */
    private function updateContentSizesRevert($prevEdits)
    {
        $this->revertCount++;

        $reverted = $prevEdits['prev'];

        if (!$reverted) {
            // No previous edit is known, so there is nothing to discount. This also
            // keeps the max addition/deletion checks below from calling a method on null.
            return $prevEdits;
        }

        // Adjust addedBytes given this edit was a revert of the previous one.
        $revertedSize = $reverted->getSize();
        if ($revertedSize > 0) {
            $this->addedBytes -= $revertedSize;

            // Also deduct from the user's individual added byte count.
            $username = $reverted->getUser()->getUsername();
            $this->editors[$username]['added'] -= $revertedSize;
        }

        // @TODO: Test this against an edit war (use your sandbox).
        // Also remove as max added or deleted, if applicable.
        if ($this->maxAddition && $reverted->getId() === $this->maxAddition->getId()) {
            $this->maxAddition = $prevEdits['maxAddition'];
            $prevEdits['maxAddition'] = $reverted; // In the event of edit wars.
        } elseif ($this->maxDeletion && $reverted->getId() === $this->maxDeletion->getId()) {
            $this->maxDeletion = $prevEdits['maxDeletion'];
            $prevEdits['maxDeletion'] = $reverted; // In the event of edit wars.
        }

        return $prevEdits;
    }
862
863
    /**
     * Updates the figures on content sizes assuming the given edit
     * was NOT a revert of the previous edit. A positive size counts as
     * added content; a negative size may become the new largest deletion.
     * @param  Edit   $edit
     * @param  Edit[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     * @return Edit[] Updated version of $prevEdits, for tracking.
     */
    private function updateContentSizesNonRevert(Edit $edit, $prevEdits)
    {
        $delta = $this->getEditSize($edit, $prevEdits);

        if ($delta > 0) {
            // Content was added: credit both the page total and the editor.
            $this->addedBytes += $delta;
            $this->editors[$edit->getUser()->getUsername()]['added'] += $delta;

            $isLargestAddition = !$this->maxAddition || $delta > $this->maxAddition->getSize();
            if ($isLargestAddition) {
                // Remember the old maxAddition in case the next edit reverts this one,
                // in which case the old one has to be restored.
                $prevEdits['maxAddition'] = $this->maxAddition;
                $this->maxAddition = $edit;
            }

            return $prevEdits;
        }

        if ($delta < 0 && (!$this->maxDeletion || $delta < $this->maxDeletion->getSize())) {
            // Remember the old maxDeletion in case the next edit reverts this one,
            // in which case the old one has to be restored.
            $prevEdits['maxDeletion'] = $this->maxDeletion;
            $this->maxDeletion = $edit;
        }

        return $prevEdits;
    }
897
898
    /**
     * Get the size of the given edit, based on the previous edit (if present).
     * We also don't return the actual edit size if last revision had a length of null.
     * This happens when the edit follows other edits that were revision-deleted.
     * @see T148857 for more information.
     * @todo Remove once T101631 is resolved.
     * @param  Edit   $edit
     * @param  Edit[] $prevEdits With 'prev', 'prevSha', 'maxAddition' and 'maxDeletion'.
     * @return int The value of Edit::getSize(), or 0 if the previous edit has a null length.
     */
    private function getEditSize(Edit $edit, $prevEdits)
    {
        $previous = $prevEdits['prev'];

        if ($previous && $previous->getLength() === null) {
            // The size cannot be trusted when the previous length is unknown.
            return 0;
        }

        return $edit->getSize();
    }
916
917
    /**
     * Record the given edit in the automated-tool statistics, if it was made
     * with a tool. Edits for which no tool is detected leave all counts untouched.
     * @param Edit $edit
     */
    private function updateToolCounts(Edit $edit)
    {
        $tool = $edit->getTool($this->container);

        if ($tool !== false) {
            $year = $edit->getYear();
            $month = $edit->getMonth();

            // Overall, per-year and per-month tallies of automated edits.
            $this->automatedCount++;
            $this->yearMonthCounts[$year]['automated']++;
            $this->yearMonthCounts[$year]['months'][$month]['automated']++;

            // Per-tool tally; the link is stored the first time a tool is seen.
            $toolName = $tool['name'];
            if (isset($this->tools[$toolName])) {
                $this->tools[$toolName]['count']++;
            } else {
                $this->tools[$toolName] = [
                    'count' => 1,
                    'link' => $tool['link'],
                ];
            }
        }
    }
946
947
    /**
     * Fold the given edit into the per-year and per-month statistics, and
     * keep $this->maxEditsPerMonth in step with the busiest month seen so far.
     * @param Edit $edit
     */
    private function updateYearMonthCounts(Edit $edit)
    {
        $year = $edit->getYear();
        $month = $edit->getMonth();

        // First edit seen for this year: create the blank year/month structure.
        if (!isset($this->yearMonthCounts[$year])) {
            $this->addYearMonthCountEntry($edit);
        }

        // Every edit counts towards the 'all' totals of its year and month.
        $this->yearMonthCounts[$year]['all']++;
        $this->yearMonthCounts[$year]['months'][$month]['all']++;

        // Overwritten on each edit, so the value left at the end is the one
        // taken from the last edit processed for that year.
        $this->yearMonthCounts[$year]['size'] = (int) $edit->getLength();

        // Track the highest number of edits made in any single month.
        $monthTotal = $this->yearMonthCounts[$year]['months'][$month]['all'];
        if ($monthTotal > $this->maxEditsPerMonth) {
            $this->maxEditsPerMonth = $monthTotal;
        }
    }
973
974
    /**
     * Add a new entry to $this->yearMonthCounts for the year of the given edit,
     * with blank values for each month. This is called during self::parseHistory().
     * Months before the month of the first edit, or after self::getLastDay(),
     * get no entry at all.
     * @param Edit $edit
     */
    private function addYearMonthCountEntry(Edit $edit)
    {
        $editYear = $edit->getYear();

        // Beginning of the first edit's month at 00:00:00. The year and month are
        // cast explicitly because mktime() expects integers.
        $firstEditTime = mktime(
            0,
            0,
            0,
            (int) $this->firstEdit->getMonth(),
            1,
            (int) $this->firstEdit->getYear()
        );

        $this->yearMonthCounts[$editYear] = [
            'all' => 0,
            'minor' => 0,
            'anon' => 0,
            'automated' => 0,
            'size' => 0, // Keep track of the size by the end of the year.
            'events' => [],
            'months' => [],
        ];

        for ($i = 1; $i <= 12; $i++) {
            // Beginning of month $i of the edit's year at 00:00:00.
            $monthStart = mktime(0, 0, 0, $i, 1, (int) $editYear);

            // Don't show zeros for months before the first edit or after the last day.
            if ($monthStart < $firstEditTime || $monthStart > $this->getLastDay()) {
                continue;
            }

            // Months are keyed by their zero-padded number, '01' through '12'.
            $this->yearMonthCounts[$editYear]['months'][sprintf('%02d', $i)] = [
                'all' => 0,
                'minor' => 0,
                'anon' => 0,
                'automated' => 0,
            ];
        }
    }
1012
1013
    /**
     * Increment the anonymous-edit and minor-edit tallies (overall, per year
     * and per month) that apply to the given edit.
     * @param Edit $edit
     */
    private function updateAnonMinorCounts(Edit $edit)
    {
        $year = $edit->getYear();
        $month = $edit->getMonth();

        // Keys of the year/month counters that this edit should bump.
        $counters = [];

        if ($edit->isAnon()) {
            $this->anonCount++;
            $counters[] = 'anon';
        }

        if ($edit->isMinor()) {
            $this->minorCount++;
            $counters[] = 'minor';
        }

        foreach ($counters as $counter) {
            $this->yearMonthCounts[$year][$counter]++;
            $this->yearMonthCounts[$year]['months'][$month][$counter]++;
        }
    }
1037
1038
    /**
     * Fold the given edit into the statistics kept for its author in
     * $this->editors, creating the author's entry on their first edit.
     * @param Edit $edit
     */
    private function updateUserCounts(Edit $edit)
    {
        $username = $edit->getUser()->getUsername();

        if (isset($this->editors[$username])) {
            $stats = $this->editors[$username];
        } else {
            // First edit seen from this user: start from blank statistics.
            $stats = [
                'all' => 0,
                'minor' => 0,
                'minorPercentage' => 0,
                'first' => $edit->getTimestamp(),
                'firstId' => $edit->getId(),
                'last' => null,
                'atbe' => null,
                'added' => 0,
                'sizes' => [],
            ];
        }

        // This edit is, so far, the user's most recent one.
        $stats['all']++;
        $stats['last'] = $edit->getTimestamp();
        $stats['lastId'] = $edit->getId();

        // Record the length reported by the edit, expressed in KB.
        $stats['sizes'][] = $edit->getLength() / 1024;

        if ($edit->isMinor()) {
            $stats['minor']++;
        }

        $this->editors[$username] = $stats;
    }
1074
1075
    /**
     * Increment the "edits in the past day/week/month/year" counters for
     * every one of those windows that the given edit falls into.
     * @param Edit $edit
     */
    private function updateCountHistory(Edit $edit)
    {
        $editTimestamp = $edit->getTimestamp();

        // Counter key => relative date marking the start of that window.
        $windows = [
            'day' => '-1 day',
            'week' => '-1 week',
            'month' => '-1 month',
            'year' => '-1 year',
        ];

        foreach ($windows as $period => $offset) {
            if ($editTimestamp > new DateTime($offset)) {
                $this->countHistory[$period]++;
            }
        }
    }
1096
1097
    /**
     * Get info about the bots that edited the page.
     * @return mixed[] Keyed by the bot's username; each value holds the bot's
     *   edit count to the page ('count') and whether or not the account is
     *   currently a bot ('current').
     */
    public function getBots()
    {
        $bots = $this->bots;

        return $bots;
    }
1106
1107
    /**
     * Load info about the bots that edited the page into $this->bots, keyed
     * by username and ordered by edit count, highest first. This is a private
     * setter because the information is needed when computing the top 10
     * editors, where we don't want to include bots.
     */
    private function setBots()
    {
        $rows = $this->getRepository()->getBotData($this->page, $this->startDate, $this->endDate);

        $botStats = [];
        while ($row = $rows->fetch()) {
            $botStats[$row['username']] = [
                'count' => (int) $row['count'],
                // True only when the row's 'current' value is exactly 'bot'.
                'current' => $row['current'] === 'bot',
            ];
        }

        // Highest edit count first, keeping the username keys.
        uasort($botStats, function ($left, $right) {
            return $right['count'] - $left['count'];
        });

        $this->bots = $botStats;
    }
1131
1132
    /**
     * Number of edits made to the page by current or former bots. The total
     * is computed once and cached; later calls return the cached value even
     * if a different $bots is passed in.
     * @param string[] $bots Used only in unit tests, where we
     *   supply mock data for the bots that will get processed.
     * @return int
     */
    public function getBotRevisionCount($bots = null)
    {
        if (!isset($this->botRevisionCount)) {
            $botList = $bots === null ? $this->getBots() : $bots;

            $total = 0;
            foreach ($botList as $botInfo) {
                $total += $botInfo['count'];
            }

            $this->botRevisionCount = $total;
        }

        return $this->botRevisionCount;
    }
1157
1158
    /**
     * Query for log events during each year of the article's history,
     *   and set the results in $this->yearMonthCounts.
     * Only protections (including pending-changes protections), deletions and
     * moves are counted; events of any other log type are skipped.
     */
    private function setLogsEvents()
    {
        // Log type => i18n key under which the event is counted.
        // Pending-changes ('stable') protections are counted along with normal protections.
        $actionKeys = [
            'protect' => 'protections',
            'delete' => 'deletions',
            'move' => 'moves',
            'stable' => 'protections',
        ];

        $logData = $this->getRepository()->getLogEvents(
            $this->page,
            $this->startDate,
            $this->endDate
        );

        foreach ($logData as $event) {
            $year = date('Y', strtotime($event['timestamp']));

            // NOTE(review): this stops at the first event whose year has no entry,
            // rather than skipping just that event. Presumably the order in which
            // the events arrive makes that safe; confirm against the repository query.
            if (!isset($this->yearMonthCounts[$year])) {
                break;
            }

            // Skip log types we don't chart. Without this, $action would be undefined
            // here, or left over from the previous event and counted a second time.
            $logType = $event['log_type'];
            if (!isset($actionKeys[$logType])) {
                continue;
            }
            $action = $actionKeys[$logType];

            if (empty($this->yearMonthCounts[$year]['events'][$action])) {
                $this->yearMonthCounts[$year]['events'][$action] = 1;
            } else {
                $this->yearMonthCounts[$year]['events'][$action]++;
            }
        }
    }
1206
1207
    /**
     * Set statistics about the top 10 editors by added text and number of edits.
     * This is run *after* parseHistory() since we need the grand totals first.
     * Various stats ('minorPercentage', 'atbe' and 'size') are also set for each
     * editor in $this->editors to be used in the charts.
     *
     * Results are stored in $this->topTenEditorsByEdits, $this->topTenEditorsByAdded
     * and $this->topTenCount; nothing is returned.
     */
    private function setTopTenCounts()
    {
        $topTenCount = $counter = 0;
        $topTenEditors = [];

        foreach ($this->editors as $editor => $info) {
            // Tally the first 10 editors that are not bots. Bots are matched by exact
            // array key: a loose in_array() over the bot names could match unrelated
            // usernames (e.g. numeric-looking names comparing equal).
            if ($counter < 10 && !isset($this->bots[$editor])) {
                $topTenCount += $info['all'];
                $counter++;

                // To be used in the Top Ten charts.
                $numProcessed = $this->getNumRevisionsProcessed();
                $topTenEditors[] = [
                    'label' => $editor,
                    'value' => $info['all'],
                    'percentage' => $numProcessed
                        ? 100 * ($info['all'] / $numProcessed)
                        : 0,
                ];
            }

            // Compute the percentage of minor edits the user made.
            $this->editors[$editor]['minorPercentage'] = $info['all']
                ? ($info['minor'] / $info['all']) * 100
                : 0;

            if ($info['all'] > 1) {
                // Number of seconds/days between first and last edit.
                $secs = $info['last']->getTimestamp() - $info['first']->getTimestamp();
                $days = $secs / (60 * 60 * 24);

                // Average time between edits (in days).
                $this->editors[$editor]['atbe'] = $days / $info['all'];
            }

            // Average of the stored sizes (usually one per edit the user made to this page).
            $numSizes = count($info['sizes']);
            $this->editors[$editor]['size'] = $numSizes
                ? array_sum($info['sizes']) / $numSizes
                : 0;
        }

        $this->topTenEditorsByEdits = $topTenEditors;

        // First sort editors array by the amount of text they added, largest first.
        $topTenEditorsByAdded = $this->editors;
        uasort($topTenEditorsByAdded, function ($a, $b) {
            if ($a['added'] === $b['added']) {
                return 0;
            }
            return $a['added'] > $b['added'] ? -1 : 1;
        });

        // Then build a new array of top 10 editors by added text, in the data structure
        // needed for the chart. preserve_keys must be true: otherwise array_slice()
        // renumbers integer keys, and PHP stores numeric usernames as integer keys.
        $topTenNames = array_keys(array_slice($topTenEditorsByAdded, 0, 10, true));
        $this->topTenEditorsByAdded = array_map(function ($editor) {
            $added = $this->editors[$editor]['added'];
            return [
                'label' => $editor,
                'value' => $added,
                // Nothing may have been added at all, so guard the division.
                'percentage' => $this->addedBytes
                    ? 100 * ($added / $this->addedBytes)
                    : 0,
            ];
        }, $topTenNames);

        $this->topTenCount = $topTenCount;
    }
1282
1283
    /**
     * Get authorship attribution from the WikiWho API.
     * A successful result is cached on the instance, so later calls return
     * it as-is, whatever $limit they pass.
     * @see https://f-squared.org/wikiwho/
     * @param  int $limit Max number of results.
     * @return array With keys 'list' (author => 'count' and 'percentage'),
     *   'totalAuthors' and 'totalCount'; or only the key 'error' if the
     *   response contains no revision.
     */
    public function getTextshares($limit = null)
    {
        if (isset($this->textshares)) {
            return $this->textshares;
        }

        // TODO: check for failures. Should have a success:true
        $response = $this->getRepository()->getTextshares($this->page);

        // If revision can't be found, return error message.
        if (!isset($response['revisions'][0])) {
            $message = 'Unknown';
            if (isset($response['Error'])) {
                $message = $response['Error'];
            }

            return [
                'error' => $message
            ];
        }

        // The revision data sits under its revision ID, which is the first key.
        $revision = $response['revisions'][0];
        $revId = array_keys($revision)[0];
        $tokens = $revision[$revId]['tokens'];

        list($counts, $totalCount, $userIds) = $this->countTokens($tokens);
        $usernameMap = $this->getUsernameMap($userIds);

        $selectedCounts = $limit === null
            ? $counts
            : array_slice($counts, 0, $limit, true);

        // Key each author by username where the ID could be resolved to one,
        // and otherwise by the identifier (such as an IP address) as counted.
        $list = [];
        foreach ($selectedCounts as $editor => $count) {
            $author = isset($usernameMap[$editor]) ? $usernameMap[$editor] : $editor;
            $list[$author] = [
                'count' => $count,
                'percentage' => round(100 * ($count / $totalCount), 1)
            ];
        }

        $this->textshares = [
            'list' => $list,
            'totalAuthors' => count($counts),
            'totalCount' => $totalCount,
        ];

        return $this->textshares;
    }
1341
1342
    /**
     * Look up the usernames that belong to the given user IDs on the page's project.
     * @param  int[] $userIds
     * @return array IDs as keys, usernames as values.
     */
    private function getUsernameMap($userIds)
    {
        $repository = $this->getRepository();
        $rows = $repository->getUsernamesFromIds($this->page->getProject(), $userIds);

        $namesById = [];
        foreach ($rows as $row) {
            $namesById[$row['user_id']] = $row['user_name'];
        }

        return $namesById;
    }
1361
1362
    /**
     * Get counts of token lengths for each author. Used in self::getTextshares()
     * @param  array $tokens Each token needs an 'editor' and a 'str' element.
     * @return array [counts by user (highest first), total count, IDs of accounts].
     *   Each account ID is listed only once, in order of first appearance.
     */
    private function countTokens($tokens)
    {
        $counts = [];
        $userIds = [];
        $seenIds = [];
        $totalCount = 0;

        // Loop through the tokens, keeping totals (token length) for each author.
        foreach ($tokens as $token) {
            $editor = $token['editor'];
            $length = strlen($token['str']);

            // IPs are prefixed with '0|', otherwise it's the user ID.
            if (substr($editor, 0, 2) === '0|') {
                $editor = substr($editor, 2);
            } elseif (!isset($seenIds[$editor])) {
                // Collect each ID once only: an author usually owns many tokens, and
                // the list is later used to look up usernames.
                $seenIds[$editor] = true;
                $userIds[] = $editor;
            }

            if (!isset($counts[$editor])) {
                $counts[$editor] = 0;
            }

            $counts[$editor] += $length;
            $totalCount += $length;
        }

        // Sort authors by count.
        arsort($counts);

        return [$counts, $totalCount, $userIds];
    }
1397
1398
    /**
     * Get the domain names of the wikis that are supported by WikiWho.
     * @return string[]
     * @codeCoverageIgnore
     */
    public function getTextshareWikis()
    {
        $supportedWikis = self::TEXTSHARE_WIKIS;

        return $supportedWikis;
    }
1407
1408
    /**
     * Get prose and reference information, taken from the page's HTML
     * (as of the end date, if one is set).
     * @return array With keys 'characters', 'words', 'references',
     *   'unique_references' and 'sections'.
     */
    public function getProseStats()
    {
        $datetime = null;
        if ($this->endDate !== false) {
            $datetime = new DateTime('@'.$this->endDate);
        }

        $crawler = new Crawler($this->page->getHTMLContent($datetime));

        list($chars, $words) = $this->countCharsAndWords($crawler, '#mw-content-text p');

        // Collect the text of every reference, so that duplicates can be discounted.
        $refNodes = $crawler->filter('#mw-content-text .reference');
        $refTexts = [];
        $refNodes->each(function ($refNode) use (&$refTexts) {
            $refTexts[] = $refNode->text();
        });

        // Each headline element counts as one section.
        $headlines = $crawler->filter('#mw-content-text .mw-headline');

        return [
            'characters' => $chars,
            'words' => $words,
            'references' => $refNodes->count(),
            'unique_references' => count(array_unique($refTexts)),
            'sections' => count($headlines),
        ];
    }
1438
1439
    /**
     * Count the number of characters and words of the plain text
     * within the DOM elements matched by the given selector.
     * Numeric citation markers such as "[12]" are left out of both counts.
     * @param  Crawler $crawler
     * @param  string $selector HTML selector.
     * @return array [num chars, num words]
     */
    private function countCharsAndWords($crawler, $selector)
    {
        $totalChars = 0;
        $totalWords = 0;

        $crawler->filter($selector)->each(function ($node) use (&$totalChars, &$totalWords) {
            $text = preg_replace('/\[\d+\]/', '', trim($node->text()));

            // Count characters rather than bytes, so that text outside of ASCII
            // isn't counted several times over.
            $totalChars += mb_strlen($text, 'UTF-8');

            // Split on runs of whitespace and drop empty pieces, so that an empty
            // element contributes no words and repeated spaces don't inflate the count.
            $totalWords += count(preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY));
        });

        return [$totalChars, $totalWords];
    }
1459
1460
    /**
     * Get the transclusion data (categories, templates and files) of the page.
     * It is fetched from the repository on first use, and kept on the
     * instance for subsequent calls.
     * @return array With keys 'categories', 'templates' and 'files'.
     */
    private function getTransclusionData()
    {
        // Already fetched earlier.
        if (is_array($this->transclusionData)) {
            return $this->transclusionData;
        }

        $repository = $this->getRepository();
        $this->transclusionData = $repository->getTransclusionData($this->page);

        return $this->transclusionData;
    }
1473
1474
    /**
     * Get the number of categories that are on the page,
     * as given by the 'categories' element of the transclusion data.
     * @return int
     */
    public function getNumCategories()
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['categories'];
    }
1482
1483
    /**
     * Get the number of templates that are on the page,
     * as given by the 'templates' element of the transclusion data.
     * @return int
     */
    public function getNumTemplates()
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['templates'];
    }
1491
1492
    /**
     * Get the number of files that are on the page,
     * as given by the 'files' element of the transclusion data.
     * @return int
     */
    public function getNumFiles()
    {
        $transclusions = $this->getTransclusionData();

        return $transclusions['files'];
    }
1500
}
1501