1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Magium\Extractors; |
4
|
|
|
|
5
|
|
|
/**
 * Finds a date/time expression embedded in free-form text.
 *
 * Usage: call setText(), then extract(), then read the result with
 * getDateString(). The text is split into tokens; each token is tested
 * against a set of date/time components (month, day, year, ...). The span
 * from the earliest to the latest matching token is accepted as the date
 * string if date_parse() can read it without errors.
 */
class DateTime extends AbstractExtractor
{

    /**
     * Text to search, as supplied through setText().
     *
     * @var string|null
     */
    protected $text;

    /**
     * Date substring found by the most recent extract() call, or null when
     * nothing was found.
     *
     * @var string|null
     */
    protected $dateString;

    /**
     * Lazily built list of accepted timezone names, shared by all instances.
     *
     * @var string[]|null
     */
    private static $timeZoneList = null;

    /**
     * Sets the text that extract() will search.
     *
     * @param string $text
     * @return $this
     */
    public function setText($text)
    {
        $this->text = $text;
        return $this;
    }

    /**
     * Returns the date substring located by the last extract() call.
     *
     * @return string|null The substring of the original text, or null if no date was found.
     */
    public function getDateString()
    {
        return $this->dateString;
    }

    /**
     * Searches the current text for a date and stores the matching substring.
     *
     * The result is available through getDateString(); it is null when no
     * parsable date was found.
     *
     * @return void
     */
    public function extract()
    {
        // Drop the result of any earlier run so a reused extractor never reports a stale date.
        $this->dateString = null;

        $source = (string)$this->text;

        // A token is a maximal run of word characters, ':', '/' or '-'; everything else is a
        // separator. Capturing offsets here lets the final substring be cut at the position of
        // the matched tokens themselves rather than at the first place their text occurs.
        if (!preg_match_all('/[\w:\/\\-]+/', $source, $found, PREG_OFFSET_CAPTURE)) {
            return;
        }
        $tokens = $found[0]; // each entry is [tokenText, byteOffset]

        // For every component: index of the first token that satisfies it, or null.
        $matchedParts = [
            'month'    => null,
            'day'      => null,
            'year'     => null,
            'hour'     => null,
            'minute'   => null,
            'seconds'  => null,
            'timezone' => null,
            'meridiem' => null,
        ];

        foreach ($tokens as $key => $token) {
            foreach (array_keys($matchedParts) as $term) {
                if ($matchedParts[$term] !== null) {
                    // Only the first matching token is recorded for each component.
                    continue;
                }
                /*
                 * Every still-unmatched component is tested against every token, because in
                 * "Dec 01 01:01:01" a single token can satisfy several components at once.
                 */
                $fn = 'test' . ucfirst($term);
                if ($this->$fn($token[0])) {
                    $matchedParts[$term] = $key;
                }
            }
        }

        // Index 0 is a valid match, so filter on null rather than on truthiness.
        $indexes = array_filter($matchedParts, function ($index) {
            return $index !== null;
        });
        if (!$indexes) {
            // Didn't find a date
            return;
        }
        $min = min($indexes);
        $max = max($indexes);

        // Rebuild the candidate from the tokens alone (single-space separated) and let
        // date_parse() decide whether the span really is a date.
        $candidate = [];
        for ($i = $min; $i <= $max; $i++) {
            $candidate[] = $tokens[$i][0];
        }
        $parse = date_parse(implode(' ', $candidate));
        if (!isset($parse['error_count']) || $parse['error_count'] != 0) {
            return;
        }

        // Cut the span out of the original text so its original separators are preserved.
        $start = $tokens[$min][1];
        $end = $tokens[$max][1] + strlen($tokens[$max][0]);
        $this->dateString = substr($source, $start, $end - $start);
    }

    /**
     * Tells whether the token is a known timezone abbreviation or identifier.
     *
     * Abbreviations are accepted in all-lowercase or all-uppercase form;
     * identifiers additionally in their canonical spelling.
     *
     * @param string $part
     * @return bool
     */
    protected function testTimezone($part)
    {
        if (self::$timeZoneList === null) {
            $list = [];
            foreach (array_keys(timezone_abbreviations_list()) as $id) {
                $id = (string)$id;
                if (strlen($id) > 1) { // 'a' is really a timezone?
                    $list[] = strtolower($id);
                    $list[] = strtoupper($id);
                }
            }
            foreach (\DateTimeZone::listIdentifiers() as $id) {
                $list[] = strtolower($id);
                $list[] = strtoupper($id);
                $list[] = $id; // Items like 'America/Chicago' should be left as is.
            }
            self::$timeZoneList = $list;
        }

        return in_array($part, self::$timeZoneList, true);
    }

    /**
     * Tells whether the token is an AM/PM marker (all-lowercase or all-uppercase).
     *
     * @param string $part
     * @return bool
     */
    protected function testMeridiem($part)
    {
        return in_array($part, ['am', 'AM', 'PM', 'pm'], true);
    }

    /**
     * Tells whether the token could be, or could contain, a day of the month.
     *
     * @param string $part
     * @return bool
     */
    protected function testDay($part)
    {
        return $this->testDateComponent($part, 'day', 31);
    }

    /**
     * Tells whether the token could be, or could contain, a month.
     *
     * @param string $part
     * @return bool
     */
    protected function testMonth($part)
    {
        return $this->testDateComponent($part, 'month', 12);
    }

    /**
     * Tells whether the token could be, or could contain, a year.
     *
     * @param string $part
     * @return bool
     */
    protected function testYear($part)
    {
        $parse = date_parse($part);
        if (!empty($parse) && $parse['year'] !== false && $parse['error_count'] == 0) {
            return true;
        }

        if (strlen($part) == 4) {
            // date_parse() reads a bare "2015" as the time 20:15:00, so a four-digit token whose
            // halves come back as hour and minute is treated as a year. The ctype_digit() guard
            // keeps time-like tokens such as "1:01" from passing the loose comparisons below.
            return ctype_digit($part)
                && $parse['second'] === 0
                && $parse['hour'] == substr($part, 0, 2)
                && $parse['minute'] == substr($part, 2, 2);
        }

        return (bool)preg_match('/^\d{1,2}[\/\\-]\d{1,2}[\/\\-]\d{2,4}$/', $part);
    }

    /**
     * Tells whether the token could be, or could contain, an hour.
     *
     * @param string $part
     * @return bool
     */
    protected function testHour($part)
    {
        // Plain numbers are accepted up to 23 and rejected above 24.
        // NOTE(review): "24" itself is decided by date_parse(); thresholds kept as they were.
        return $this->testTimeComponent($part, 'hour', 23, 24);
    }

    /**
     * Tells whether the token could be, or could contain, a minute.
     *
     * @param string $part
     * @return bool
     */
    protected function testMinute($part)
    {
        return $this->testTimeComponent($part, 'minute', 60, 60);
    }

    /**
     * Tells whether the token could be, or could contain, a seconds value.
     *
     * @param string $part
     * @return bool
     */
    protected function testSeconds($part)
    {
        return $this->testTimeComponent($part, 'second', 60, 60);
    }

    /**
     * Shared check behind testDay() and testMonth().
     *
     * @param string $part  Token to test.
     * @param string $field Key of the date_parse() result that must be populated.
     * @param int    $limit Largest plain number accepted for this component.
     * @return bool
     */
    private function testDateComponent($part, $field, $limit)
    {
        if (ctype_digit($part) && $part <= $limit) {
            return true;
        }

        $parse = date_parse($part);
        if (!empty($parse) && $parse[$field] !== false && $parse['error_count'] == 0) {
            return true;
        }

        // Numeric d/m/y style tokens that date_parse() did not accept.
        return (bool)preg_match('/^\d{1,2}[\/\\-]\d{1,2}[\/\\-]\d{1,2}$/', $part);
    }

    /**
     * Shared check behind testHour(), testMinute() and testSeconds().
     *
     * @param string $part        Token to test.
     * @param string $field       Key of the date_parse() result that must be populated.
     * @param int    $acceptUpTo  Largest plain number accepted outright.
     * @param int    $rejectAbove Plain numbers greater than this are rejected outright, so
     *                            that large numbers such as years are not taken for a time part.
     * @return bool
     */
    private function testTimeComponent($part, $field, $acceptUpTo, $rejectAbove)
    {
        $isNumber = ctype_digit($part);

        if ($isNumber && $part <= $acceptUpTo) {
            return true;
        }
        if (preg_match('/^\d{1,2}:\d{1,2}:\d{1,2}$/', $part)) {
            return true;
        }
        if ($isNumber && $part > $rejectAbove) {
            return false;
        }

        $parse = date_parse($part);

        return !empty($parse) && $parse[$field] !== false && $parse['error_count'] == 0;
    }

}
207
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.