1
|
1 |
|
import re |
2
|
1 |
|
from typing import Optional |
3
|
|
|
|
4
|
|
|
|
5
|
1 |
|
class DateCleaner:
    """
    Utility class for converting dates in miscellaneous formats to ISO-8601 (YYYY-MM-DD) format.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Maps lower-case three-letter month abbreviations to two-digit month numbers. Only the Dutch abbreviations that
    # differ from the English ones are listed separately.
    month_map = {
        # English
        'jan': '01',
        'feb': '02',
        'mar': '03',
        'apr': '04',
        'may': '05',
        'jun': '06',
        'jul': '07',
        'aug': '08',
        'sep': '09',
        'oct': '10',
        'nov': '11',
        'dec': '12',

        # Dutch
        'mrt': '03',
        'mei': '05',
        'okt': '10'
    }

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _iso_time(hours: str, minutes: str, seconds: str = '00') -> str:
        """
        Returns the time suffix ('THH:MM:SS') for an ISO-8601 timestamp, zero-padding every component to two digits.

        :param hours: The hours, one or two digits.
        :param minutes: The minutes, one or two digits.
        :param seconds: The seconds, one or two digits.
        """
        return 'T' + hours.zfill(2) + ':' + minutes.zfill(2) + ':' + seconds.zfill(2)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def clean(date: Optional[str], ignore_time: bool = False) -> Optional[str]:
        """
        Converts a date in miscellaneous format to ISO-8601 (YYYY-MM-DD) format.

        The following formats are tried, in this order:
          * YYYY-MM-DD, DD-MM-YYYY and DD-MM-YY, with '-', '/', '.' or space as separator and optional leading zeros;
          * DD-MM-YYYY HH:mm:ss and DD-MM-YYYY HH:mm (result gets a 'THH:MM:SS' suffix unless ignore_time is set);
          * DDmonYYYY, optionally followed by HH:mm:ss, where mon is a key of month_map (case-insensitive);
          * YYYYMMDD.

        Two-digit years 20..99 are mapped to 19xx, years 00..19 to 20xx. A time part consisting of zeros only is
        dropped. No calendar validation is done (e.g. month 13 is passed through as is).

        :param date: The input date.
        :param ignore_time: Whether any trailing time part must be ignored.

        :return: The date in ISO-8601 format, or the input unchanged when it is empty or its format is not recognized.
        """
        # Return empty input immediately.
        if not date:
            return date

        parts = re.split(r'[\-/. ]', date)

        # The split result is usable as a plain date when there is no time part, the time part may be ignored, or the
        # time part (and the optional fraction after a '.') consists of zeros only.
        if (len(parts) == 3) or \
                (len(parts) > 3 and ignore_time) or \
                (len(parts) == 4 and re.match(r'^[0:]*$', parts[3])) or \
                (len(parts) == 5 and re.match(r'^[0:]*$', parts[3]) and re.match(r'^0*$', parts[4])):
            # Only reorder parts that are really numeric; otherwise arbitrary text such as 'to be done' would be
            # turned into a bogus date and empty parts would be padded to '00'.
            if all(re.fullmatch(r'[0-9]+', part) for part in parts[:3]):
                first, second, third = parts[:3]

                if len(first) == 4 and len(second) <= 2 and len(third) <= 2:
                    # Assume date is in YYYY-MM-DD or YYYY-M-D format.
                    return first + '-' + second.zfill(2) + '-' + third.zfill(2)

                if len(first) <= 2 and len(second) <= 2 and len(third) == 4:
                    # Assume date is in DD-MM-YYYY or D-M-YYYY format.
                    return third + '-' + second.zfill(2) + '-' + first.zfill(2)

                if len(first) <= 2 and len(second) <= 2 and len(third) == 2:
                    # Assume date is in DD-MM-YY or D-M-YY format. Years 20..99 belong to the previous century.
                    year = '19' + third if third >= '20' else '20' + third

                    return year + '-' + second.zfill(2) + '-' + first.zfill(2)

        # Shared prefix of the DD-MM-YYYY based patterns below.
        dmy_prefix = r'^(\d{2})\D(\d{2})\D(\d{4})'

        # Try DD-MM-YYYY HH:mm:ss format
        pattern = dmy_prefix + ('.*$' if ignore_time else r'(\D(\d{1,2})\D(\d{1,2})\D(\d{1,2}))?$')
        match = re.match(pattern, date)
        if match:
            ret = match.group(3) + '-' + match.group(2) + '-' + match.group(1)
            # The time groups only exist in the pattern when the time is not ignored.
            if not ignore_time and match.group(4):
                ret += DateCleaner._iso_time(match.group(5), match.group(6), match.group(7))
            return ret

        # Try DD-MM-YYYY HH:mm format
        pattern = dmy_prefix + ('.*$' if ignore_time else r'(\D(\d{1,2})\D(\d{1,2}))?$')
        match = re.match(pattern, date)
        if match:
            ret = match.group(3) + '-' + match.group(2) + '-' + match.group(1)
            if not ignore_time and match.group(4):
                ret += DateCleaner._iso_time(match.group(5), match.group(6))
            return ret

        # Try DDmonYYYY or DDmonYYYY HH:mm:ss format
        pattern = r'^(\d{2})([a-z]{3})(\d{4})' + ('.*$' if ignore_time else r'(\D(\d{1,2})\D(\d{1,2})\D(\d{1,2}))?$')
        match = re.match(pattern, date.lower())
        if match and match.group(2) in DateCleaner.month_map:
            ret = match.group(3) + '-' + DateCleaner.month_map[match.group(2)] + '-' + match.group(1)
            if not ignore_time and match.group(4):
                ret += DateCleaner._iso_time(match.group(5), match.group(6), match.group(7))
            return ret

        # Try YYYYMMDD format.
        pattern = r'^\d{8}' + ('.*$' if ignore_time else '$')
        match = re.match(pattern, date)
        if match:
            # Assume date is YYYYMMDD format
            return date[0:4] + '-' + date[4:6] + '-' + date[6:8]

        # Format not recognized. Just return the original string.
        return date
101
|
|
|
|
102
|
|
|
# ---------------------------------------------------------------------------------------------------------------------- |
103
|
|
|
|