etlt.cleaner.DateCleaner   A
last analyzed

Complexity

Total Complexity 35

Size/Duplication

Total Lines 101
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 62
dl 0
loc 101
ccs 29
cts 29
cp 1
rs 9.6
c 0
b 0
f 0
wmc 35

1 Method

Rating   Name   Duplication   Size   Complexity  
F DateCleaner.clean() 0 68 35
1 1
import re
2 1
from typing import Optional
3
4
5 1
class DateCleaner:
    """
    Utility class for converting dates in miscellaneous formats to ISO-8601 (YYYY-MM-DD) format.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Map from lowercase three-letter month abbreviations to two-digit month numbers. Only the Dutch abbreviations
    # that differ from their English counterparts are listed separately.
    month_map = {
            # English
            'jan': '01',
            'feb': '02',
            'mar': '03',
            'apr': '04',
            'may': '05',
            'jun': '06',
            'jul': '07',
            'aug': '08',
            'sep': '09',
            'oct': '10',
            'nov': '11',
            'dec': '12',

            # Dutch
            'mrt': '03',
            'mei': '05',
            'okt': '10'
    }

    # Regex tails used after a date prefix. The optional-time tails capture the whole time part in one group and
    # the individual components in the groups that follow.
    _OPTIONAL_HMS = r'(\D(\d{1,2})\D(\d{1,2})\D(\d{1,2}))?$'
    _OPTIONAL_HM = r'(\D(\d{1,2})\D(\d{1,2}))?$'
    _ANYTHING = '.*$'

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _is_date_only(parts: list, ignore_time: bool) -> bool:
        """
        Returns whether the parts of a split date string may be treated as a date without a (relevant) time part.

        :param parts: The input date split on '-', '/', '.', and ' '.
        :param ignore_time: Whether any trailing time part must be ignored.
        """
        count = len(parts)
        if count == 3:
            return True

        if count > 3 and ignore_time:
            return True

        if count == 4:
            # Trailing time consisting of zeros and colons only, e.g. 00:00:00.
            return bool(re.match(r'^[0:]*$', parts[3]))

        if count == 5:
            # Trailing zero time with zero fractional seconds, e.g. 00:00:00.000.
            return bool(re.match(r'^[0:]*$', parts[3]) and re.match(r'^0*$', parts[4]))

        return False

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _from_numeric_parts(first: str, second: str, third: str) -> Optional[str]:
        """
        Converts three numeric date parts to YYYY-MM-DD format. Returns None when the parts do not form a known
        layout.

        :param first: The first part of the date.
        :param second: The second part of the date.
        :param third: The third part of the date.
        """
        # Only digits can form a date; without this check arbitrary text such as 'no-go-zone' would be rearranged.
        if not (first.isdecimal() and second.isdecimal() and third.isdecimal()):
            return None

        if len(first) == 4 and len(second) <= 2 and len(third) <= 2:
            # Assume date is in YYYY-MM-DD or YYYY-M-D format.
            return first + '-' + second.zfill(2) + '-' + third.zfill(2)

        if len(first) <= 2 and len(second) <= 2:
            if len(third) == 4:
                # Assume date is in DD-MM-YYYY or D-M-YYYY format.
                return third + '-' + second.zfill(2) + '-' + first.zfill(2)

            if len(third) == 2:
                # Assume date is in DD-MM-YY or D-M-YY format. Two-digit years from 20 onward are taken to be in the
                # 20th century, years below 20 in the 21st century.
                century = '19' if third >= '20' else '20'

                return century + third + '-' + second.zfill(2) + '-' + first.zfill(2)

        return None

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _time_suffix(match) -> str:
        """
        Returns the 'THH:mm:ss' suffix for a match made with one of the optional-time tails, or an empty string when
        the match holds no time part.

        :param match: The match object. Group 4 is the whole time part, groups 5 and 6 the hours and minutes, and
                      group 7 (when the pattern has it) the seconds.
        """
        groups = match.groups()
        if len(groups) < 6 or not groups[3]:
            return ''

        seconds = groups[6] if len(groups) == 7 else '00'

        # Zero-pad each component: ISO-8601 requires two-digit hours, minutes, and seconds.
        return 'T' + groups[4].zfill(2) + ':' + groups[5].zfill(2) + ':' + seconds.zfill(2)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def clean(date: Optional[str], ignore_time: bool = False) -> Optional[str]:
        """
        Converts a date in miscellaneous format to ISO-8601 (YYYY-MM-DD) format.

        Recognized formats are YYYY-MM-DD, DD-MM-YYYY, and DD-MM-YY (with '-', '/', '.', or ' ' as separator and
        optionally single digit day and month), DD-MM-YYYY followed by HH:mm:ss or HH:mm, DDmonYYYY optionally
        followed by HH:mm:ss, and YYYYMMDD. When a non-zero time part is present and not ignored, the result has
        the form YYYY-MM-DDTHH:mm:ss.

        :param date: The input date.
        :param ignore_time: Whether any trailing time part must be ignored.

        :return: The date in ISO-8601 format, or the input unchanged when it is empty or its format is not
                 recognized.
        """
        # Return empty input immediately.
        if not date:
            return date

        parts = re.split(r'[\-/. ]', date)
        if DateCleaner._is_date_only(parts, ignore_time):
            cleaned = DateCleaner._from_numeric_parts(parts[0], parts[1], parts[2])
            if cleaned is not None:
                return cleaned

        # Try DD-MM-YYYY HH:mm:ss format, then DD-MM-YYYY HH:mm format.
        for optional_time in (DateCleaner._OPTIONAL_HMS, DateCleaner._OPTIONAL_HM):
            tail = DateCleaner._ANYTHING if ignore_time else optional_time
            match = re.match(r'^(\d{2})\D(\d{2})\D(\d{4})' + tail, date)
            if match:
                day, month, year = match.group(1), match.group(2), match.group(3)

                return year + '-' + month + '-' + day + DateCleaner._time_suffix(match)

        # Try DDmonYYYY or DDmonYYYY HH:mm:ss format.
        tail = DateCleaner._ANYTHING if ignore_time else DateCleaner._OPTIONAL_HMS
        match = re.match(r'^(\d{2})([a-z]{3})(\d{4})' + tail, date.lower())
        if match and match.group(2) in DateCleaner.month_map:
            month = DateCleaner.month_map[match.group(2)]

            return match.group(3) + '-' + month + '-' + match.group(1) + DateCleaner._time_suffix(match)

        # Try YYYYMMDD format.
        if re.match(r'^\d{8}' + (DateCleaner._ANYTHING if ignore_time else '$'), date):
            return date[0:4] + '-' + date[4:6] + '-' + date[6:8]

        # Format not recognized. Just return the original string.
        return date
101
102
# ----------------------------------------------------------------------------------------------------------------------
103