Completed
Branch master (803fe5)
by Andrew
05:47 queued 03:48
created

PublishDateExtractor::getDateFromSchemaOrg()   D

Complexity

Conditions 9
Paths 48

Size

Total Lines 45
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 45
c 0
b 0
f 0
rs 4.909
cc 9
eloc 22
nc 48
nop 0
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\ArticleMutatorTrait;
7
use Goose\Modules\{AbstractModule, ModuleInterface};
8
use DOMWrap\Element;
9
10
/**
11
 * Publish Date Extractor
12
 *
13
 * @package Goose\Modules\Extractors
14
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
15
 */
16
class PublishDateExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @inheritdoc  */
    public function run(Article $article): self {
        $this->article($article);

        // Sources are tried in priority order; `??` short-circuits, so a later
        // source is only consulted when every earlier one returned null.
        $dt = $this->getDateFromSchemaOrg()
            ?? $this->getDateFromOpenGraph()
            ?? $this->getDateFromURL()
            ?? $this->getDateFromDublinCore()
            ?? $this->getDateFromParsely();

        $article->setPublishDate($dt);

        return $this;
    }

    /**
     * Determine the publish date from a YYYY/MM/DD or YYYY-MM-DD segment of the final URL.
     *
     * The time component of the returned date is always 00:00:00.
     *
     * @return \DateTime|null The date found in the URL, or null when the URL has no valid date segment
     */
    private function getDateFromURL(): ?\DateTime {
        $url = $this->article()->getFinalUrl();

        if (!is_string($url) || $url === '') {
            return null;
        }

        if (!preg_match('@(?:[\d]{4})(?<delimiter>[/-])(?:[\d]{2})\k<delimiter>(?:[\d]{2})@U', $url, $matches)) {
            /** @todo Add more date detection methods */

            return null;
        }

        $delimiter = $matches['delimiter'];

        // createFromFormat() silently rolls over out-of-range fields (month 13, day 45, ...),
        // so digit groups that do not form a real calendar date are rejected up front.
        [$year, $month, $day] = explode($delimiter, $matches[0]);

        if (!checkdate((int)$month, (int)$day, (int)$year)) {
            return null;
        }

        $dt = \DateTime::createFromFormat('Y' . $delimiter . 'm' . $delimiter . 'd', $matches[0]);

        // The failure check must come before any method call on the result.
        if ($dt === false) {
            return null;
        }

        $dt->setTime(0, 0, 0);

        return $dt;
    }

    /**
     * Check for and determine dates from Schema.org's datePublished property.
     *
     * Checks HTML tags (e.g. <meta>, <time>, etc.) and JSON-LD.
     *
     * @return \DateTime|null
     *
     * @see https://schema.org/datePublished
     */
    private function getDateFromSchemaOrg(): ?\DateTime {
        // HTML tags (<meta>, <time>, etc.) take precedence over JSON-LD.
        return $this->getDateFromAttributes('*[itemprop="datePublished"]', ['datetime', 'content'])
            ?? $this->getDateFromJsonLd('datePublished');
    }

    /**
     * Check for and determine dates based on Dublin Core standards.
     *
     * @return \DateTime|null
     *
     * @see http://dublincore.org/documents/dcmi-terms/#elements-date
     * @see http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml
     */
    private function getDateFromDublinCore(): ?\DateTime {
        return $this->getDateFromAttributes(
            '*[name="dc.date"], *[name="dc.date.issued"], *[name="DC.date.issued"]',
            ['content']
        );
    }

    /**
     * Check for and determine dates based on OpenGraph standards.
     *
     * Tries the 'published_time' entry first, then 'pubdate'. An unparsable
     * 'published_time' no longer prevents 'pubdate' from being tried.
     *
     * @return \DateTime|null
     *
     * @see http://ogp.me/
     * @see http://ogp.me/#type_article
     */
    private function getDateFromOpenGraph(): ?\DateTime {
        $og_data = $this->article()->getOpenGraph();

        foreach (['published_time', 'pubdate'] as $key) {
            if (!isset($og_data[$key])) {
                continue;
            }

            $dt = $this->parseDate($og_data[$key]);

            if ($dt !== null) {
                return $dt;
            }
        }

        return null;
    }

    /**
     * Check for and determine dates based on Parsely metadata.
     *
     * Checks JSON-LD, <meta> tags and parsely-page.
     *
     * @return \DateTime|null
     *
     * @see https://www.parsely.com/help/integration/jsonld/
     * @see https://www.parsely.com/help/integration/metatags/
     * @see https://www.parsely.com/help/integration/ppage/
     */
    private function getDateFromParsely(): ?\DateTime {
        // JSON-LD, then <meta> tags
        $dt = $this->getDateFromJsonLd('dateCreated')
            ?? $this->getDateFromAttributes('meta[name="parsely-pub-date"]', ['content']);

        if ($dt !== null) {
            return $dt;
        }

        // parsely-page: a <meta> tag whose content attribute holds a JSON document
        $nodes = $this->article()->getRawDoc()->find('meta[name="parsely-page"]');

        /** @var Element $node */
        foreach ($nodes as $node) {
            if (!$node->hasAttribute('content')) {
                continue;
            }

            $json = json_decode($node->getAttribute('content'));

            if (!is_object($json) || !isset($json->pub_date)) {
                continue;
            }

            $dt = $this->parseDate($json->pub_date);

            if ($dt !== null) {
                return $dt;
            }
        }

        return null;
    }

    /**
     * Return the first parsable date found in the given attributes of the nodes matching a selector.
     *
     * Nodes are visited in document order; for each node the attributes are tried
     * in the order given. Unparsable or empty values are skipped.
     *
     * @param string   $selector   CSS selector passed to the raw document's find()
     * @param string[] $attributes Attribute names to inspect, in priority order
     *
     * @return \DateTime|null
     */
    private function getDateFromAttributes(string $selector, array $attributes): ?\DateTime {
        $nodes = $this->article()->getRawDoc()->find($selector);

        /** @var Element $node */
        foreach ($nodes as $node) {
            foreach ($attributes as $attribute) {
                if (!$node->hasAttribute($attribute)) {
                    continue;
                }

                $dt = $this->parseDate($node->getAttribute($attribute));

                if ($dt !== null) {
                    return $dt;
                }
            }
        }

        return null;
    }

    /**
     * Return the first parsable date stored under a top-level property of a JSON-LD script block.
     *
     * Only documents that decode to a JSON object are inspected; anything else
     * (invalid JSON, top-level arrays, scalars) is skipped.
     *
     * @param string $property Name of the top-level JSON-LD property holding the date
     *
     * @return \DateTime|null
     */
    private function getDateFromJsonLd(string $property): ?\DateTime {
        $nodes = $this->article()->getRawDoc()->find('script[type="application/ld+json"]');

        /** @var Element $node */
        foreach ($nodes as $node) {
            $json = json_decode($node->text());

            if (!is_object($json) || !isset($json->{$property})) {
                continue;
            }

            $dt = $this->parseDate($json->{$property});

            if ($dt !== null) {
                return $dt;
            }
        }

        return null;
    }

    /**
     * Convert a raw metadata value into a DateTime, or null when it is not a usable date string.
     *
     * Non-string values are rejected explicitly: under strict_types they would make the
     * DateTime constructor raise a \TypeError, which is not an \Exception. Blank strings
     * are rejected because the DateTime constructor interprets '' as the current time.
     *
     * @param mixed $value Raw value taken from an attribute, JSON document or OpenGraph entry
     *
     * @return \DateTime|null
     */
    private function parseDate($value): ?\DateTime {
        if (!is_string($value)) {
            return null;
        }

        $value = trim($value);

        if ($value === '') {
            return null;
        }

        try {
            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Unrecognizable date information; let the caller move on to the next candidate.
            return null;
        }
    }
}
262