1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace ValueParsers; |
4
|
|
|
|
5
|
|
|
use DataValues\TimeValue; |
6
|
|
|
use DateTime; |
7
|
|
|
use Exception; |
8
|
|
|
|
9
|
|
|
/**
 * Time parser using PHP's DateTime object. Since the behavior of PHP's parser can be quite odd
 * (for example, it pads missing elements with the current date and does actual calculations such as
 * parsing "2015-00-00" as "2014-12-30") this parser should only be used as a fallback.
 *
 * This class implements heuristics to guess which sequence of digits in the input represents the
 * year. This is relevant because PHP's parser can only handle 4-digit years as expected. The
 * following criteria are used to identify the year:
 *
 * - The first number longer than 2 digits or bigger than 59.
 * - The first number in the input, if it is bigger than 31.
 * - The third of three space-separated parts at the beginning of the input, if it is a number.
 * - The third number in the input.
 * - The last number in the input otherwise.
 *
 * @since 0.7
 *
 * @license GPL-2.0+
 * @author Addshore
 * @author Thiemo Kreuz
 */
class PhpDateTimeParser extends StringValueParser {

	const FORMAT_NAME = 'php-date-time';

	/**
	 * @var MonthNameUnlocalizer
	 */
	private $monthNameUnlocalizer;

	/**
	 * @var ValueParser
	 */
	private $eraParser;

	/**
	 * @var ValueParser
	 */
	private $isoTimestampParser;

	/**
	 * @param MonthNameUnlocalizer $monthNameUnlocalizer Used to translate month names to English,
	 * the language PHP's DateTime parser understands.
	 * @param ValueParser $eraParser String parser that detects signs, "BC" suffixes and such and
	 * returns an array with the detected sign character and the remaining value.
	 * @param ValueParser $isoTimestampParser String parser that gets a language independent
	 * YMD-ordered timestamp and returns a TimeValue object. Used for precision detection.
	 */
	public function __construct(
		MonthNameUnlocalizer $monthNameUnlocalizer,
		ValueParser $eraParser,
		ValueParser $isoTimestampParser
	) {
		parent::__construct();

		$this->monthNameUnlocalizer = $monthNameUnlocalizer;
		$this->eraParser = $eraParser;
		$this->isoTimestampParser = $isoTimestampParser;
	}

	/**
	 * Parses a date/time string by normalizing it and handing it to PHP's DateTime object, then
	 * passes the resulting timestamp string on to the ISO timestamp parser.
	 *
	 * @param string $value in a format as specified by the PHP DateTime object.
	 * There are exceptions, as we can handle dates with years of 5 and more digits.
	 *
	 * @throws ParseException if any step fails. Every Exception raised inside this method is
	 * rethrown as a ParseException that carries the original message, the unmodified input and
	 * self::FORMAT_NAME.
	 * @return TimeValue
	 */
	protected function stringParse( $value ) {
		// Kept untouched for error reporting; $value is rewritten several times below.
		$rawValue = $value;

		try {
			list( $sign, $value ) = $this->eraParser->parse( $value );

			$value = trim( $value );
			$value = $this->monthNameUnlocalizer->unlocalize( $value );
			// Rewrites the year inside $value to exactly 4 digits, returns the full year.
			$year = $this->fetchAndNormalizeYear( $value );

			$value = $this->getValueWithFixedSeparators( $value, $year );

			$this->validateDateTimeInput( $value );

			// Parse using the DateTime object (this will allow us to format the date in a nicer way)
			$dateTime = new DateTime( $value );

			// Fail if the DateTime object does calculations like changing 2015-00-00 to 2014-12-30.
			if ( $year !== null && $dateTime->format( 'Y' ) !== substr( $year, -4 ) ) {
				throw new ParseException( $value . ' is not a valid date.' );
			}

			// Input was one, two, or three numbers? Where the heck does a time come from?
			if ( $dateTime->format( 'H:i:s' ) !== '00:00:00'
				&& preg_match( '/^\D*\d+(?:\D+\d+){0,2}\D*$/', $value )
			) {
				throw new ParseException( $value . ' is not a valid date.' );
			}

			if ( $year !== null && strlen( $year ) > 4 ) {
				// DateTime only saw the last 4 digits, so the full year is put back in here.
				$timestamp = $sign . $year . $dateTime->format( '-m-d\TH:i:s\Z' );
			} else {
				$timestamp = $sign . $dateTime->format( 'Y-m-d\TH:i:s\Z' );
			}

			// Use a common base parser for precision detection and option handling.
			return $this->isoTimestampParser->parse( $timestamp );
		} catch ( Exception $exception ) {
			throw new ParseException( $exception->getMessage(), $rawValue, self::FORMAT_NAME );
		}
	}

	/**
	 * Rejects inputs that should not be handed to PHP's DateTime object at all.
	 *
	 * @param string $value
	 *
	 * @throws ParseException if the value does not contain any digit, or if it consists of a
	 * number of 1 to 7 digits that is followed by nothing but a "+" with optional digits, or by
	 * nothing but non-digit characters.
	 */
	private function validateDateTimeInput( $value ) {
		// we don't support input of non-digits only, such as 'x'.
		if ( !preg_match( '/\d/', $value ) ) {
			throw new ParseException( $value . ' does not contain a digit.' );
		}

		// @todo i18n support for these exceptions
		// we don't support dates in format of year + timezone
		if ( preg_match( '/^\d{1,7}(\+\d*|\D*)$/', $value ) ) {
			throw new ParseException( $value . ' is not a valid date.' );
		}
	}

	/**
	 * PHP's DateTime object does not accept spaces as separators between year, month and day,
	 * e.g. dates like 20 12 2012, but we want to support them.
	 * See http://de1.php.net/manual/en/datetime.formats.date.php
	 *
	 * Dots, commas and whitespace that directly follow a digit or a letter are replaced with "-"
	 * if the value consists of the year followed by two more numbers, and with "." otherwise.
	 *
	 * @param string $value A value in which the year, if any, is already cut to 4 digits.
	 * @param string|null $year The full year as returned by fetchAndNormalizeYear, possibly
	 * longer than 4 digits.
	 *
	 * @return string
	 */
	private function getValueWithFixedSeparators( $value, $year = null ) {
		// The year inside $value was cut to its last 4 digits by fetchAndNormalizeYear, while
		// $year may be longer. Search for the cut form, otherwise years with 5 or more digits
		// can never be recognized as the leading element.
		$isYmd = $year !== null
			&& preg_match( '/^\D*' . substr( $year, -4 ) . '\D+\d+\D+\d+\D*$/', $value );
		$separator = $isYmd ? '-' : '.';
		// Meant to match separator characters after day and month. \p{L} matches letters outside
		// the ASCII range.
		return preg_replace( '/(?<=[\d\p{L}])[.,\s]\s*/', $separator, $value );
	}

	/**
	 * Tries to find and pad the sequence of digits in the input that represents the year.
	 * Refer to the class level documentation for a description of the heuristics used.
	 *
	 * @param string &$value A time value string, possibly containing a year. If found, the year in
	 * the string will be cut and padded to exactly 4 digits.
	 *
	 * @return string|null The full year, if found, not cut but padded to at least 4 digits.
	 */
	private function fetchAndNormalizeYear( &$value ) {
		// NOTE: When changing the regex matching below, keep the class level
		// documentation of the extraction heuristics up to date!
		$patterns = array(
			// Check if the string contains a number longer than 2 digits or bigger than 59.
			'/(?<!\d)(' // can not be prepended by a digit
				. '\d{3,}|' // any number longer than 2 digits, or
				. '[6-9]\d' // any number bigger than 59
				. ')(?!\d)/', // can not be followed by a digit

			// Check if the first number in the string is bigger than 31.
			'/^\D*(3[2-9]|[4-9]\d)/',

			// Check if the string starts with three space-separated parts or three numbers.
			'/^(?:'
				. '\S+\s+\S+\s+|' // e.g. "July<SPACE>4th<SPACE>", or
				. '\d+\D+\d+\D+' // e.g. "4.7."
				. ')(\d+)/', // followed by a number

			// Check if the string ends with a number.
			'/(\d+)\D*$/',
		);

		// The patterns are ordered by priority, the first one that matches wins. preg_match
		// resets $matches on every call, so it is empty if none of the patterns matched.
		foreach ( $patterns as $pattern ) {
			if ( preg_match( $pattern, $value, $matches, PREG_OFFSET_CAPTURE ) ) {
				break;
			}
		}

		if ( !isset( $matches[1] ) ) {
			return null;
		}

		// With PREG_OFFSET_CAPTURE each group is an array of the matched text and its offset.
		$year = $matches[1][0];
		$index = $matches[1][1];
		// Length of the digits as found in the input, needed to replace them in place below.
		$length = strlen( $year );

		// Trim irrelevant leading zeros.
		$year = ltrim( $year, '0' );

		// Pad to at least 4 digits.
		$year = str_pad( $year, 4, '0', STR_PAD_LEFT );

		// Manipulate the value to have an exactly 4-digit year. Crucial for PHP's DateTime object.
		$value = substr_replace( $value, substr( $year, -4 ), $index, $length );

		return $year;
	}

}
213
|
|
|
|