Total Complexity | 57 |
Total Lines | 680 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like EventWikiRepository often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use EventWikiRepository, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
20 | class EventWikiRepository extends Repository |
||
21 | { |
||
22 | /** Max number of pages to be returned when generating page IDs. */ |
||
23 | public const MAX_PAGES = 50000; |
||
24 | |||
/**
 * Name of the entity class that this repository is associated with.
 * Implements Repository::getEntityClass
 * @return string The class name of EventWiki.
 */
public function getEntityClass(): string
{
    return EventWiki::class;
}
||
34 | |||
/**
 * Get the wiki's domain name without the .org given a database name or domain.
 *
 * Input starting with '*.' is treated as a wiki family: the remainder is validated
 * with getWikiFamilyName() and returned with the '*.' prefix restored.
 *
 * @param string $value Database name (with or without a trailing '_p'), a domain
 *   (with or without '.org'), or '*.' followed by a wiki family name.
 * @return string|null Null if no wiki was found.
 */
public function getDomainFromEventWikiInput(string $value): ?string
{
    if ('*.' === substr($value, 0, 2)) {
        $family = $this->getWikiFamilyName(substr($value, 2));
        return null !== $family ? '*.'.$family : null;
    }

    // Strip only a trailing '_p'. Removing every '_p' occurrence would also mangle
    // a database name that happens to contain '_p' somewhere in the middle.
    $dbName = preg_replace('/_p$/', '', $value) ?? $value;

    $conn = $this->getMetaConnection();
    $rqb = $conn->createQueryBuilder();
    // NOTE(review): LIKE treats '%' and '_' in $value as wildcards; confirm whether
    // an exact comparison was intended for the two URL conditions.
    $rqb->select(['dbname', 'url'])
        ->from('wiki')
        ->where($rqb->expr()->eq('dbname', ':project'))
        ->orWhere($rqb->expr()->like('url', ':projectUrl'))
        ->orWhere($rqb->expr()->like('url', ':projectUrl2'))
        ->setParameter('project', $dbName)
        ->setParameter('projectUrl', "https://$value")
        ->setParameter('projectUrl2', "https://$value.org");
    $row = $this->executeQueryBuilder($rqb)->fetch();

    // No matches found.
    if (!$row) {
        return null;
    }

    // Extract and return just the domain name without '.org' suffix.
    $matches = [];
    preg_match('/^https?\:\/\/(.*)\.org$/', $row['url'], $matches);

    return $matches[1] ?? null;
}
||
73 | |||
/**
 * This effectively validates the given name as a wiki family
 * (wikipedia, wiktionary, etc). Null is returned if invalid.
 * @param string $value The wiki family name.
 * @return string|null The wiki family name as stored in the 'wiki' table, or null if invalid.
 */
public function getWikiFamilyName(string $value): ?string
{
    $conn = $this->getMetaConnection();
    $rqb = $conn->createQueryBuilder();
    $rqb->select(['family'])
        ->from('wiki')
        ->where($rqb->expr()->eq('family', ':family'))
        ->setParameter('family', $value);

    $row = $this->executeQueryBuilder($rqb)->fetch();

    // When nothing matches, fetch() does not return a row. Indexing into that
    // result directly raises an "array offset on bool" notice, so check first.
    if (!is_array($row)) {
        return null;
    }

    return $row['family'] ?? null;
}
||
90 | |||
/**
 * Look up the database name for the given (partial) domain name.
 * @param string $domain The domain name, without trailing '.org'.
 * @return string The database name with '_p' appended.
 * @throws Exception If the lookup yields no database name for the domain.
 */
public function getDbNameFromDomain(string $domain): string
{
    $rqb = $this->getMetaConnection()->createQueryBuilder();
    $rqb->select(["CONCAT(dbname, '_p') AS dbname"])
        ->from('wiki')
        ->where('url = :projectUrl')
        ->setParameter('projectUrl', "https://$domain.org");

    $result = $this->executeQueryBuilder($rqb)->fetch();

    if (isset($result['dbname'])) {
        return $result['dbname'];
    }

    throw new Exception("Unable to determine database name for domain '$domain'.");
}
||
113 | |||
/**
 * Public static method to convert wikitext to HTML, can be used on any arbitrary string.
 * Does NOT support section links unless you specify a page.
 *
 * A leading section marker of the form "/* Section *\/" is turned into a link to that
 * section of the page, and internal [[links]] are converted by wikifyInternalLinks().
 *
 * @param string $wikitext
 * @param string $domain The project domain such as en.wikipedia
 * @param string|null $pageTitle The title of the page, including namespace. Without it,
 *   a section marker is left as plain text.
 * @return string HTML.
 * @static
 */
public static function wikifyString(string $wikitext, string $domain, ?string $pageTitle = null): string
{
    // Bring the text to exactly one level of entity encoding. Quotes are left
    // untouched here and are escaped below wherever a value lands in an attribute.
    $wikitext = htmlspecialchars(html_entity_decode($wikitext), ENT_NOQUOTES);
    $sectionMatch = null;
    $isSection = preg_match_all("/^\/\* (.*?) \*\//", $wikitext, $sectionMatch);

    if ($isSection && isset($pageTitle)) {
        // The URL is placed inside a single-quoted href, so quotes in the title
        // (e.g. "Schindler's List") must be escaped or they end the attribute early.
        $pageUrl = "https://$domain.org/wiki/"
            .htmlspecialchars(ucfirst(str_replace(' ', '_', $pageTitle)), ENT_QUOTES);

        // The section title was already entity-encoded above; only add quote escaping
        // and do not encode existing entities a second time.
        $sectionTitle = htmlspecialchars($sectionMatch[1][0], ENT_QUOTES, 'UTF-8', false);

        // Must have underscores for the link to properly go to the section.
        $sectionTitleLink = str_replace(' ', '_', $sectionTitle);

        $sectionWikitext = "<a target='_blank' href='$pageUrl#$sectionTitleLink'>→</a>".
            "<em class='text-muted'>".$sectionTitle.":</em> ";
        $wikitext = str_replace($sectionMatch[0][0], trim($sectionWikitext), $wikitext);
    }

    return self::wikifyInternalLinks($wikitext, $domain);
}
||
144 | |||
/**
 * Converts internal links in wikitext to HTML.
 *
 * Each [[Target]] or [[Target|label]] (optionally with a leading colon) becomes an
 * anchor pointing at https://<domain>.org/wiki/<Target>, with spaces in the target
 * replaced by underscores and its first character upper-cased.
 *
 * @param string $wikitext
 * @param string $domain The project domain such as en.wikipedia
 * @return string Updated wikitext.
 * @static
 */
private static function wikifyInternalLinks(string $wikitext, string $domain): string
{
    $pagePath = "https://$domain.org/wiki/";

    // Single pass over the text: the generated anchors are never scanned again.
    $converted = preg_replace_callback(
        "/\[\[:?(.*?)\]\]/",
        static function (array $match) use ($pagePath): string {
            $parts = explode('|', $match[1]);

            // ENT_QUOTES because the path goes inside a single-quoted href; double_encode
            // is off because the text may already be entity-encoded by the caller.
            $path = htmlspecialchars($parts[0], ENT_QUOTES, 'UTF-8', false);
            $text = isset($parts[1])
                ? htmlspecialchars($parts[1], ENT_QUOTES, 'UTF-8', false)
                : $path;

            // Use normalized page title (underscored, capitalized).
            $pageUrl = $pagePath.ucfirst(str_replace(' ', '_', $path));

            return "<a target='_blank' href='$pageUrl'>$text</a>";
        },
        $wikitext
    );

    // preg_replace_callback() returns null on a PCRE error; fall back to the input.
    return $converted ?? $wikitext;
}
||
172 | |||
173 | /** |
||
174 | * Get all available wikis on the replicas, as defined by EventWiki::VALID_WIKI_PATTERN. |
||
175 | * @return string[] With domain as the keys, database name as the values. |
||
176 | */ |
||
177 | public function getAvailableWikis(): array |
||
197 | } |
||
198 | |||
199 | /** |
||
200 | * Get all unique page IDs edited/created within the Event for the given wiki. If you need to do this for pages |
||
201 | * within specific categories, without participants, use EventCategoryRepository::getPagesInCategories(). |
||
202 | * @param string $dbName |
||
203 | * @param DateTime $start |
||
204 | * @param DateTime $end |
||
205 | * @param int[] $actors |
||
206 | * @param string[] $categoryTitles |
||
207 | * @param string $type Whether only pages 'created' or 'edited' should be returned. Default is to return both. |
||
208 | * To get pages improved, first get edited then use array_diff against created. |
||
209 | * @return int[] |
||
210 | */ |
||
211 | public function getPageIds( |
||
270 | } |
||
271 | |||
272 | /** |
||
273 | * Get the total pageviews count for a set of pages, from a given date until today. Optionally reduce to an average |
||
274 | * of the last N days, where N is Event::AVAILABLE_METRICS['pages-improved-pageviews-avg']. |
||
275 | * @param string $dbName |
||
276 | * @param string $domain |
||
277 | * @param DateTime $start |
||
278 | * @param int[] $pageIds |
||
279 | * @param bool $getDailyAverage |
||
280 | * @return int |
||
281 | */ |
||
282 | public function getPageviews( |
||
319 | } |
||
320 | |||
321 | /** |
||
322 | * Get the page titles of the pages with the given IDs. |
||
323 | * @param string $dbName |
||
324 | * @param int[] $pageIds |
||
325 | * @param bool $stmt Whether to get only the statement, so that the calling method can use fetch(). |
||
326 | * @param bool $includePageIds Whether to include page IDs in the result. |
||
327 | * @return mixed[]|\Doctrine\DBAL\Driver\ResultStatement |
||
328 | */ |
||
329 | public function getPageTitles(string $dbName, array $pageIds, bool $stmt = false, bool $includePageIds = false) |
||
340 | } |
||
341 | |||
342 | /** |
||
343 | * Calculates the number of bytes changed during an event |
||
344 | * |
||
345 | * @param Event $event |
||
346 | * @param string $dbName |
||
347 | * @param int[] $pageIds |
||
348 | * @param int[] $actors |
||
349 | * @return int |
||
350 | */ |
||
351 | public function getBytesChanged(Event $event, string $dbName, array $pageIds, array $actors): int |
||
400 | } |
||
401 | |||
402 | /** |
||
403 | * Get the list of users participating in an event with no predefined user list |
||
404 | * |
||
405 | * @param string $dbName |
||
406 | * @param int[] $pageIds |
||
407 | * @param Event $event |
||
408 | * @return string[] |
||
409 | */ |
||
410 | public function getUsersFromPageIDs(string $dbName, array $pageIds, Event $event): array |
||
425 | } |
||
426 | |||
/**
 * Get data for a single page, to be included in the Pages Created report.
 *
 * Runs one UNION query that yields up to four metric/value rows:
 *   - 'creator': author of the page's earliest revision,
 *   - 'edits': number of revisions up to $end (by $usernames, if given),
 *   - 'bytes': length of the latest such revision (absent if there is none),
 *   - 'links': number of links to $pageTitle from non-redirect pages, both in namespace 0.
 * The result is cached for 10 minutes.
 *
 * @param string $dbName
 * @param int $pageId
 * @param string $pageTitle
 * @param string[] $usernames Restrict 'edits' and 'bytes' to these users; empty for no restriction.
 * @param DateTime $end
 * @return string[] Values keyed by metric name.
 */
public function getSinglePageCreatedData(
    string $dbName,
    int $pageId,
    string $pageTitle,
    array $usernames,
    DateTime $end
): array {
    // Use cache if it exists.
    $cacheKey = $this->getCacheKey(func_get_args(), 'pages_created_info');
    if ($this->cache->hasItem($cacheKey)) {
        return $this->cache->getItem($cacheKey)->get();
    }

    $endTimestamp = $end->format('YmdHis');
    $filterByUser = !empty($usernames);
    $usernamesSql = $filterByUser ? 'AND rev_user_text IN (:usernames)' : '';

    // Only use revision_userindex when filtering by user.
    $userRevisionTable = $filterByUser ? 'revision_userindex' : 'revision';

    // The 'creator' subquery orders by timestamp explicitly: LIMIT 1 without an
    // ORDER BY does not guarantee that the earliest revision is the one returned.
    $sql = "SELECT `metric`, `value` FROM (
                (
                    SELECT 'creator' AS `metric`, rev_user_text AS `value`
                    FROM $dbName.revision
                    WHERE rev_page = :pageId
                    ORDER BY rev_timestamp ASC
                    LIMIT 1
                ) UNION (
                    SELECT 'edits' AS `metric`, COUNT(*) AS `value`
                    FROM $dbName.$userRevisionTable
                    WHERE rev_page = :pageId
                        AND rev_timestamp <= :end
                        $usernamesSql
                ) UNION (
                    SELECT 'bytes' AS `metric`, rev_len AS `value`
                    FROM $dbName.$userRevisionTable
                    WHERE rev_page = :pageId
                        AND rev_timestamp <= :end
                        $usernamesSql
                    ORDER BY rev_timestamp DESC
                    LIMIT 1
                ) UNION (
                    SELECT 'links' AS `metric`, COUNT(*) AS `value`
                    FROM $dbName.pagelinks
                    JOIN $dbName.page ON page_id = pl_from
                    WHERE pl_from_namespace = 0
                        AND pl_namespace = 0
                        AND pl_title = :pageTitle
                        AND page_is_redirect = 0
                )
            ) t1";

    $ret = $this->executeReplicaQueryWithTypes(
        $sql,
        [
            'pageId' => $pageId,
            'pageTitle' => $pageTitle,
            'usernames' => $usernames,
            'end' => $endTimestamp,
        ],
        [
            'usernames' => Connection::PARAM_STR_ARRAY,
        ]
    )->fetchAll(\PDO::FETCH_KEY_PAIR);

    // Cache for 10 minutes.
    return $this->setCache($cacheKey, $ret, 'PT10M');
}
||
502 | |||
/**
 * Build the rows of the Pages Created report for a single EventWiki.
 *
 * For every page returned by getPageTitles() for the wiki's created pages, the
 * per-page data from getSinglePageCreatedData() is combined with the page title,
 * the wiki domain, and the total and average pageviews.
 *
 * @param EventWiki $wiki
 * @param string[] $usernames
 * @return mixed[] One entry per page; empty when $wiki->isFamilyWiki() is true.
 */
public function getPagesCreatedData(EventWiki $wiki, array $usernames): array
{
    if ($wiki->isFamilyWiki()) {
        return [];
    }

    $dbName = $this->getDbNameFromDomain($wiki->getDomain());
    $pageviewsRepo = $this->getPageviewsRepository();
    $avgOffset = Event::AVAILABLE_METRICS['pages-improved-pageviews-avg'];
    $stmt = $this->getPageTitles($dbName, $wiki->getPagesCreated(), true, true);
    $eventStart = $wiki->getEvent()->getStartUTC();
    $eventEnd = $wiki->getEvent()->getEndUTC();
    $pageviewsEnd = new DateTime('yesterday midnight');
    $report = [];

    while ($row = $stmt->fetch()) {
        $title = $row['page_title'];

        // FIXME: async?
        [$total, $average] = $pageviewsRepo->getPageviews(
            $wiki->getDomain(),
            [$title],
            $eventStart,
            $pageviewsEnd,
            $avgOffset
        );

        $info = $this->getSinglePageCreatedData(
            $dbName,
            (int)$row['page_id'],
            $title,
            $usernames,
            $eventEnd
        );

        // Keys set here override any same-named keys in $info.
        $extra = [
            'pageTitle' => $title,
            'wiki' => $wiki->getDomain(),
            'pageviews' => (int)$total,
            'avgPageviews' => (int)$average,
        ];
        $report[] = array_merge($info, $extra);
    }

    return $report;
}
||
552 | |||
/**
 * Get data for a single page, to be included in the Pages Improved report.
 *
 * Runs one UNION query for the period between $start and $end and reduces it to:
 *   - 'edits': number of revisions in the period (by $usernames, if given),
 *   - 'links': number of links to $pageTitle from non-redirect pages, both in namespace 0,
 *   - 'bytes': length of the last revision in the period minus the length of the
 *     parent of the first revision in the period (0 when there is no parent).
 * The result is cached for 10 minutes.
 *
 * @param string $dbName
 * @param int $pageId
 * @param string $pageTitle
 * @param string[] $usernames Restrict revisions to these users; empty for no restriction.
 * @param DateTime $start
 * @param DateTime $end
 * @return mixed[] With keys 'edits', 'links' and 'bytes'.
 * @throws \Psr\Cache\InvalidArgumentException
 */
public function getSinglePageImprovedData(
    string $dbName,
    int $pageId,
    string $pageTitle,
    array $usernames,
    DateTime $start,
    DateTime $end
): array {
    // Use cache if it exists.
    $cacheKey = $this->getCacheKey(func_get_args(), 'pages_improved_info');
    if ($this->cache->hasItem($cacheKey)) {
        return $this->cache->getItem($cacheKey)->get();
    }

    $startTimestamp = $start->format('YmdHis');
    $endTimestamp = $end->format('YmdHis');
    $filterByUser = !empty($usernames);
    $usernamesSql = $filterByUser ? 'AND rev.rev_user_text IN (:usernames)' : '';

    // Only use revision_userindex when filtering by user.
    $userRevisionTable = $filterByUser ? 'revision_userindex' : 'revision';

    $sql = "SELECT `metric`, `value` FROM (
                (
                    SELECT 'edits' AS `metric`, COUNT(*) AS `value`
                    FROM $dbName.$userRevisionTable rev
                    WHERE rev_page = :pageId
                        AND rev_timestamp BETWEEN :start AND :end
                        $usernamesSql
                ) UNION (
                    SELECT 'start_bytes' AS `metric`, COALESCE(prev.rev_len, 0) AS `value`
                    FROM $dbName.$userRevisionTable rev
                    LEFT JOIN $dbName.$userRevisionTable prev ON rev.rev_parent_id=prev.rev_id
                    WHERE rev.rev_page=:pageId
                        AND rev.rev_timestamp BETWEEN :start AND :end
                        {$usernamesSql}
                    ORDER BY rev.rev_timestamp ASC
                    LIMIT 1
                ) UNION (
                    SELECT 'end_bytes' AS `metric`, COALESCE(rev_len, 0) AS `value`
                    FROM $dbName.$userRevisionTable rev
                    WHERE rev_page=:pageId
                        AND rev_timestamp BETWEEN :start AND :end
                        {$usernamesSql}
                    ORDER BY rev_timestamp DESC
                    LIMIT 1
                ) UNION (
                    SELECT 'links' AS `metric`, COUNT(*) AS `value`
                    FROM $dbName.pagelinks
                    JOIN $dbName.page ON page_id = pl_from
                    WHERE pl_from_namespace = 0
                        AND pl_namespace = 0
                        AND pl_title = :pageTitle
                        AND page_is_redirect = 0
                )
            ) t1";

    $rows = $this->executeReplicaQueryWithTypes(
        $sql,
        [
            'pageId' => $pageId,
            'pageTitle' => $pageTitle,
            'usernames' => $usernames,
            'start' => $startTimestamp,
            'end' => $endTimestamp,
        ],
        [
            'usernames' => Connection::PARAM_STR_ARRAY,
        ]
    )->fetchAll(\PDO::FETCH_KEY_PAIR);

    // The two COUNT(*) subqueries always produce a row, but 'start_bytes' and
    // 'end_bytes' are absent when no revision matches the period (and user filter).
    // Treat a missing value as 0 rather than reading an undefined index.
    $startBytes = (int)($rows['start_bytes'] ?? 0);
    $endBytes = (int)($rows['end_bytes'] ?? 0);

    $ret = [
        'edits' => $rows['edits'],
        'links' => $rows['links'],
        'bytes' => $endBytes - $startBytes,
    ];

    // Cache for 10 minutes.
    return $this->setCache($cacheKey, $ret, 'PT10M');
}
||
643 | |||
644 | /** |
||
645 | * Get the data needed for the Pages Improved report, for a single EventWiki. |
||
646 | * @param EventWiki $wiki |
||
647 | * @param string[] $usernames |
||
648 | * @return mixed[] |
||
649 | */ |
||
650 | public function getPagesImprovedData(EventWiki $wiki, array $usernames): array |
||
689 | } |
||
690 | |||
691 | /** |
||
692 | * Creates and initializes a pageviews repository |
||
693 | * @return PageviewsRepository |
||
694 | */ |
||
695 | private function getPageviewsRepository(): PageviewsRepository |
||
702 |
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.