| Total Complexity | 40 |
| Total Lines | 244 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like PublishDateExtractor often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use PublishDateExtractor, and based on these observations, apply Extract Interface, too.
| 1 | <?php declare(strict_types=1); |
||
| 16 | class PublishDateExtractor extends AbstractModule implements ModuleInterface { |
||
| 17 | use ArticleMutatorTrait; |
||
| 18 | |||
/**
 * @inheritdoc
 *
 * Tries each publish-date source in a fixed order (Schema.org, OpenGraph,
 * URL, Dublin Core, Parsely) and hands the first non-null result to the
 * article. If every source yields null, null is passed to setPublishDate().
 */
public function run(Article $article): self {
    // The extractor methods below read the article back through $this->article().
    $this->article($article);

    // `??` evaluates its right-hand side only when the left-hand side is null,
    // so later sources are not consulted once a date has been found.
    $publishDate = $this->getDateFromSchemaOrg()
        ?? $this->getDateFromOpenGraph()
        ?? $this->getDateFromURL()
        ?? $this->getDateFromDublinCore()
        ?? $this->getDateFromParsely();

    $article->setPublishDate($publishDate);

    return $this;
}
||
| 47 | |||
/**
 * Determine the publish date from a date embedded in the article's final URL.
 *
 * Looks for a four-digit, two-digit, two-digit sequence separated by the
 * same delimiter twice ("/" or "-"), e.g. "2018/03/27" or "2018-03-27".
 *
 * @return \DateTime|null The matched date with its time set to 00:00:00, or
 *                        null when the URL contains no such sequence or the
 *                        sequence cannot be parsed.
 */
private function getDateFromURL(): ?\DateTime {
    // Determine date based on URL
    if (preg_match('@(?:[\d]{4})(?<delimiter>[/-])(?:[\d]{2})\k<delimiter>(?:[\d]{2})@U', $this->article()->getFinalUrl(), $matches)) {
        $delimiter = $matches['delimiter'];
        $dt = \DateTime::createFromFormat('Y' . $delimiter . 'm' . $delimiter . 'd', $matches[0]);

        // createFromFormat() returns false on failure, so this check has to
        // happen before any method is called on the result.
        if ($dt === false) {
            return null;
        }

        // The format carries no time fields, which createFromFormat() fills
        // from the current time; normalize to midnight.
        $dt->setTime(0, 0, 0);

        return $dt;
    }

    /** @todo Add more date detection methods */

    return null;
}
||
| 68 | |||
/**
 * Check for and determine dates from Schema.org's datePublished property.
 *
 * Checks HTML tags (e.g. <meta>, <time>, etc.) first, then JSON-LD. For an
 * HTML node the "datetime" attribute is used when present; "content" is
 * only consulted when "datetime" is absent. Blank or non-string values are
 * skipped rather than parsed, and values that \DateTime cannot parse are
 * ignored so the next candidate can be tried.
 *
 * @return \DateTime|null The first date that parses, or null when none does.
 *
 * @see https://schema.org/datePublished
 */
private function getDateFromSchemaOrg(): ?\DateTime {
    // Check for HTML tags (<meta>, <time>, etc.)
    $nodes = $this->article()->getRawDoc()->find('*[itemprop="datePublished"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        try {
            if ($node->hasAttribute('datetime')) {
                $value = $node->getAttribute('datetime');
            }
            elseif ($node->hasAttribute('content')) {
                $value = $node->getAttribute('content');
            }
            else {
                continue;
            }

            // An empty value would make \DateTime resolve to the current
            // time, which would then be reported as the publish date.
            if (!is_string($value) || trim($value) === '') {
                continue;
            }

            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Do nothing here in case the node has unrecognizable date information.
        }
    }

    // Check for JSON-LD
    $nodes = $this->article()->getRawDoc()->find('script[type="application/ld+json"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        try {
            $json = json_decode($node->text());

            if (!is_object($json) || !isset($json->datePublished)) {
                continue;
            }

            $value = $json->datePublished;

            // datePublished comes from the page's own markup and may be an
            // array, object or number. Passing that to \DateTime under
            // strict_types raises a TypeError, which the catch below (for
            // \Exception) would not intercept.
            if (!is_string($value) || trim($value) === '') {
                continue;
            }

            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Do nothing here in case the node has unrecognizable date information.
        }
    }

    return null;
}
||
| 124 | |||
| 125 | /** |
||
| 126 | * Check for and determine dates based on Dublin Core standards. |
||
| 127 | * |
||
| 128 | * @return \DateTime|null |
||
| 129 | * |
||
| 130 | * @see http://dublincore.org/documents/dcmi-terms/#elements-date |
||
| 131 | * @see http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml |
||
| 132 | */ |
||
| 133 | private function getDateFromDublinCore(): ?\DateTime { |
||
| 155 | } |
||
| 156 | |||
| 157 | /** |
||
| 158 | * Check for and determine dates based on OpenGraph standards. |
||
| 159 | * |
||
| 160 | * @return \DateTime|null |
||
| 161 | * |
||
| 162 | * @see http://ogp.me/ |
||
| 163 | * @see http://ogp.me/#type_article |
||
| 164 | */ |
||
| 165 | private function getDateFromOpenGraph(): ?\DateTime { |
||
| 183 | } |
||
| 184 | |||
| 185 | /** |
||
| 186 | * Check for and determine dates based on Parsely metadata. |
||
| 187 | * |
||
| 188 | * Checks JSON-LD, <meta> tags and parsely-page. |
||
| 189 | * |
||
| 190 | * @return \DateTime|null |
||
| 191 | * |
||
| 192 | * @see https://www.parsely.com/help/integration/jsonld/ |
||
| 193 | * @see https://www.parsely.com/help/integration/metatags/ |
||
| 194 | * @see https://www.parsely.com/help/integration/ppage/ |
||
| 195 | */ |
||
| 196 | private function getDateFromParsely(): ?\DateTime { |
||
| 260 | } |
||
| 261 | } |
||
| 262 |