RegexSequence.search_n_dict()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 14
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 11
nop 2
dl 0
loc 14
ccs 6
cts 6
cp 1
crap 2
rs 9.85
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3 1
import os
4 1
import re
5 1
import time
6
7
8 1
class StringToDictParser(object):
    """Parse a title string into a dict of named fields.

    The parser is configured with named entities (field name -> regex
    fragment) and an ordered list of separator regex fragments. Calling the
    instance with a title and a list of "designs" tries every design and
    returns the richest result.

    A design is a list of tokens. A token starting with ``'s'`` is always
    treated as a separator reference of the form ``'s<N>'`` (1-based index
    into ``separators``); any other token is looked up as an entity name.
    """

    # Matches a separator reference such as 's1' or 's12' (no 's0', no leading zeros).
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps each field name to its regex fragment
        :param list separators: regex fragments, referenced from designs as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError('every separator must be a string holding a regex fragment')
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse the title given as first positional argument.

        :param args: ``args[0]`` is the title string to parse
        :param kwargs: ``design`` is a list of designs (each a list of tokens)
        :return: the dict with the most extracted fields among all designs
            (the earliest design wins a tie); ``{}`` when ``design`` is empty
        :rtype: dict
        :raises RuntimeError: if a design is too long, contains a non-string
            token or contains a malformed separator reference
        """
        title = args[0]
        design = kwargs['design']
        max_tokens = len(self.entities) + len(self.separators)
        if not all(
            len(x) <= max_tokens and all(isinstance(y, str) for y in x)
            for x in design
        ):
            raise RuntimeError(
                'every design must be a sequence of at most {} string tokens'.format(max_tokens)
            )
        if not all(
            all(StringToDictParser.check.match(y) for y in x if y.startswith('s'))
            for x in design
        ):
            raise RuntimeError(
                "tokens starting with 's' must be separator references like 's1', 's2', ..."
            )
        results = [
            RegexSequence(list(self._yield_reg_comp(d))).search_n_dict(title)
            for d in design
        ]
        if not results:
            # No design to try means nothing can be extracted; avoid max() failing on an empty list.
            return {}
        return max(results, key=len)

    def _yield_reg_comp(self, kati):
        """Yield, for each token of a design, the separator string or the entity it refers to."""
        for k in kati:
            if k.startswith('s'):
                # 's<N>' is 1-based, the separators list is 0-based.
                yield self.separators[int(StringToDictParser.check.match(k).group(1)) - 1]
            else:
                yield self.entities[k]
42
43
44 1
class AlbumInfoEntity(object):
    """Pairs a field name with the regex fragment stored for it."""

    def __init__(self, name, reg):
        """
        :param name: the field name, kept as ``self.name``
        :param reg: the regex fragment, kept as ``self.reg``
        """
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the stored regex fragment."""
        fragment = self.reg
        return fragment
51
52
53 1
class RegexSequence(object):
    """A regex assembled from a sequence of fragments, some of which carry a field name."""

    def __init__(self, data):
        """
        :param data: re-iterable sequence of items; ``str(item)`` gives the regex
            fragment and items that have a ``name`` attribute contribute a key
        """
        names = []
        for item in data:
            if hasattr(item, 'name'):
                names.append(item.name)
        self._keys = names
        self._regex = ''.join(map(str, data))

    def search_n_dict(self, string):
        """Search ``string`` and pair the keys with the captured groups.

        :param str string: the text to search in
        :return: key -> captured text, limited to groups that captured a
            non-empty value; an empty dict when the regex does not match
        :rtype: dict
        """
        found = re.search(self._regex, string)
        if found is None:
            return {}
        result = {}
        for key, value in zip(self._keys, found.groups()):
            if value:
                result[key] = value
        return result
73 1
74 1
75 1
class StringParser(object):
    """Singleton collecting the string-parsing helpers for album/track information.

    All parsing entry points are class or static methods, so they can be used
    without creating an instance.
    """

    __instance = None

    # The patterns below contain whitespace on purpose, for readability; every
    # regex built from them must be compiled with the re.X (re.VERBOSE) flag.
    # A track line is matched as:
    #   r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})"
    regexes = {
        'track_number': r'\d{1,2}',  # greedy: takes two digits when available, backtracks to one
        'sep1': r"(?: [\t\ ]* [\.\-\,)]+ )? [\t ]*",
        'track_word_first_char': r"[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE]",
        'track_word_char': r"[\.\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]",
        'track_sep': r'[\t\ ,]+',
        'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
        'extension': r'\.mp[34]',
        'hhmmss': r'(?:\d?\d:)*\d?\d',
    }

    # Templates of the patterns derived from the base entries above.
    _TRACK_WORD_TEMPLATE = r'\(?{track_word_first_char}{track_word_char}*\)?'
    _TRACK_NAME_TEMPLATE = r'{track_word}(?:{track_sep}{track_word})*'

    # Build the derived patterns at class-definition time so that the
    # classmethods relying on them work even if no instance was ever created.
    regexes['track_word'] = _TRACK_WORD_TEMPLATE.format(**regexes)
    regexes['track_name'] = _TRACK_NAME_TEMPLATE.format(**regexes)

    ## to parse from youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser(
        {'artist': art, 'album': alb, 'year': year}, [sep1, sep2]
    )

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first use."""
        if cls.__instance is None:
            # super() takes (class, object-or-type): the defining class comes first.
            cls.__instance = super(StringParser, cls).__new__(cls)
            # Rebuild the derived patterns, as was always done on first
            # instantiation, so edits made to the base entries are picked up.
            cls.regexes['track_word'] = cls._TRACK_WORD_TEMPLATE.format(**cls.regexes)
            cls.regexes['track_name'] = cls._TRACK_NAME_TEMPLATE.format(**cls.regexes)
        return cls.__instance

    ## STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dict with potentially the 'artist', 'album' and 'year' fields.

        The following layouts are tried and the one yielding the most fields wins:

         - artist, sep1, album, sep2, year
         - artist, sep1, album
         - album, sep2, year
         - album

        where sep1 is a dash or dot (optionally surrounded by blanks) and sep2
        is any run of blanks, dashes and dots.

        :param str video_title:
        :return: the extracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(
            video_title,
            design=[
                ['artist', 's1', 'album', 's2', 'year'],
                ['artist', 's1', 'album'],
                ['album', 's2', 'year'],
                ['album'],
            ],
        )

    # Uses the cls.regexes
    # PARSE filenames
    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Extract the track number and track name from an mp3/mp4 file name.

        Only the base name of ``file_name`` is examined and it must end in
        '.mp3' or '.mp4'.

        :param str file_name: a file name or path, eg '01. Loyal to the Pack.mp3'
        :return: a dict with keys 'track_number' and 'track_name'; the number
            group is optional in the pattern, so 'track_number' is None when
            no leading number was matched
        :rtype: dict
        :raises AttributeError: if the file name does not match the pattern
        """
        pattern = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(
                **cls.regexes
            ),
            re.X,  # verbose mode: whitespace in the pattern is ignored
        )
        match = pattern.search(os.path.basename(file_name))
        return dict(zip(['track_number', 'track_name'], match.groups()))

    # Uses the cls.regexes
    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a line such as '01. Doteru 3:45' into ['Doteru', '3:45'].

        A leading track number, when present, is matched but not captured.

        :param str track_line:
        :return: the parsed items: [track name, hh:mm:ss string]
        :rtype: list
        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(
                **cls.regexes
            ),
            re.X,
        )
        return list(regex.search(track_line.strip()).groups())

    # PARSE tracks info multiline
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks (eg copy-pasted
        from a video description) into a list of lists.

        :param str tracks:
        :return: one [track_name, hhmmss formatted time] list per non-empty line
        :rtype: list
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed items of every non-empty line.

        :param str tracks: a newline-separated string of lines corresponding to the tracks information
        :return: a generator of [track_name, hhmmss] lists
        :raises AttributeError: re-raised, after printing a hint, for a line that cannot be parsed
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                # i counts blank lines too, so i + 1 is the line number in the input
                print(
                    "Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(
                        i + 1, line
                    )
                )
                raise

    # CONVERT durations to timestamps tuples (segmentation start-end pair)
    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform a newline-separated string of album tracks, whose times
        are track durations, into the list of each track's starting timestamp.

        The first timestamp is always '0:00'; each following one is produced
        by :meth:`add` from the previous timestamp and the previous track's
        duration. Note that an input without any track line yields ['0:00'].

        :param str tracks_row_strings:
        :return: the list of each track's timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # The duration of the last track does not start any further track.
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    ###########################
    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string timestamp to its equivalent duration in seconds as an integer."""
        # The right-most field counts seconds, each field to its left is worth 60 times more.
        return sum(60**i * int(x) for i, x in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def hhmmss_format(seconds):
        """Transform an integer duration in seconds to its hh:mm:ss formatted string representation.

        NOTE: formatting goes through time.gmtime, so the hours field does not
        keep counting for durations of 24 hours or more.
        """
        return time.strftime('%H:%M:%S', time.gmtime(seconds))
249