1
|
|
|
""" |
2
|
|
|
ETLT |
3
|
|
|
|
4
|
|
|
Copyright 2016 Set Based IT Consultancy |
5
|
|
|
|
6
|
|
|
Licence MIT |
7
|
|
|
""" |
8
|
1 |
|
import re |
9
|
|
|
|
10
|
|
|
|
11
|
1 |
View Code Duplication |
class DateCleaner:
    """
    Utility class for converting dates in miscellaneous formats to ISO-8601 (YYYY-MM-DD) format.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Maps lower case three letter month abbreviations to two digit month numbers. Only the Dutch abbreviations that
    # differ from the English ones are listed separately.
    month_map = {
        # English
        'jan': '01',
        'feb': '02',
        'mar': '03',
        'apr': '04',
        'may': '05',
        'jun': '06',
        'jul': '07',
        'aug': '08',
        'sep': '09',
        'oct': '10',
        'nov': '11',
        'dec': '12',

        # Dutch
        'mrt': '03',
        'mei': '05',
        'okt': '10'
    }

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _is_number(part):
        """
        Returns True if and only if a string is a non-empty sequence of ASCII digits.

        :param str part: The string to test.

        :rtype: bool
        """
        # \Z (not $) so that a trailing newline is not silently accepted.
        return re.match(r'[0-9]+\Z', part) is not None

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def clean(date, ignore_time=False):
        """
        Converts a date in miscellaneous format to ISO-8601 (YYYY-MM-DD) format.

        The following input formats are recognized (separators are '-', '/', '.', or a space):

        * YYYY-MM-DD and YYYY-M-D
        * DD-MM-YYYY and D-M-YYYY
        * DD-MM-YY and D-M-YY (years 00..19 are mapped to 20YY, years 20..99 to 19YY)
        * DDmonYYYY, optionally followed by a time part (the result is then YYYY-MM-DDTHH:MM:SS)
        * YYYYMMDD

        A trailing time part consisting of zeros only (e.g. 00:00:00 or 00:00:00.000) is always dropped. Input that
        is empty or that is not recognized is returned unchanged. Month and day values are not range checked.

        :param str date: The input date.
        :param bool ignore_time: If true any trailing time part is ignored.

        :rtype: str
        """
        # Return empty input immediately.
        if not date:
            return date

        parts = re.split(r'[\-/. ]', date)
        count = len(parts)

        # Either we have exactly three components, or the extra components are a time part that is ignored on
        # request or consists of zeros only.
        separated = (count == 3) or \
                    (count > 3 and ignore_time) or \
                    (count == 4 and re.match(r'^[0:]*$', parts[3])) or \
                    (count == 5 and re.match(r'^[0:]*$', parts[3]) and re.match(r'^0*$', parts[4]))

        # Only numeric components can form a date. Without this test arbitrary text such as 'abcd-e-f' would be
        # rewritten to 'abcd-0e-0f' and '2016--' to '2016-00-00'.
        if separated and all(DateCleaner._is_number(part) for part in parts[:3]):
            first, second, third = parts[0], parts[1], parts[2]

            if len(first) == 4 and len(second) <= 2 and len(third) <= 2:
                # Assume date is in YYYY-MM-DD or YYYY-M-D format.
                return first + '-' + second.zfill(2) + '-' + third.zfill(2)

            if len(first) <= 2 and len(second) <= 2 and len(third) == 4:
                # Assume date is in DD-MM-YYYY or D-M-YYYY format.
                return third + '-' + second.zfill(2) + '-' + first.zfill(2)

            if len(first) <= 2 and len(second) <= 2 and len(third) == 2:
                # Assume date is in DD-MM-YY or D-M-YY format. Both operands are two digit strings, hence the string
                # comparison is equivalent to a numeric comparison.
                year = '19' + third if third >= '20' else '20' + third

                return year + '-' + second.zfill(2) + '-' + first.zfill(2)

        # Try DDmonYYYY or DDmonYYYY HH:mm:ss format
        pattern = r'^(\d{2})([a-z]{3})(\d{4})' + ('.*$' if ignore_time else r'(\D(\d{1,2})\D(\d{1,2})\D(\d{1,2}))?$')
        match = re.match(pattern, date.lower())
        if match and match.group(2) in DateCleaner.month_map:
            ret = match.group(3) + '-' + DateCleaner.month_map[match.group(2)] + '-' + match.group(1)
            # The time groups only exist in the pattern that is used when the time part is not ignored.
            if len(match.groups()) == 7 and match.group(4):
                # Zero pad the time components: the pattern accepts one digit values but ISO-8601 requires two.
                ret += 'T' + ':'.join(match.group(index).zfill(2) for index in (5, 6, 7))
            return ret

        # Try YYYYMMDD format.
        pattern = r'^\d{8}' + ('.*$' if ignore_time else '$')
        match = re.match(pattern, date)
        if match:
            # Assume date is YYYYMMDD format
            return date[0:4] + '-' + date[4:6] + '-' + date[6:8]

        # Format not recognized. Just return the original string.
        return date
91
|
|
|
|
92
|
|
|
# ---------------------------------------------------------------------------------------------------------------------- |
93
|
|
|
|