Completed
Push — dev ( 232b11...6092be )
by Konstantinos
04:15 queued 01:32
created

music_album_creation.tracks_parsing   F

Complexity

Total Complexity 67

Size/Duplication

Total Lines 341
Duplicated Lines 0 %

Test Coverage

Coverage 94.61%

Importance

Changes 0
Metric Value
eloc 210
dl 0
loc 341
ccs 158
cts 167
cp 0.9461
rs 3.04
c 0
b 0
f 0
wmc 67

38 Methods

Rating   Name   Duplication   Size   Complexity  
A StringParser.add() 0 9 1
A Timestamp.__str__() 0 2 1
A StringParser._gen_timestamp_data() 0 17 3
A Timestamp.__gt__() 0 2 1
A Timestamp.__ge__() 0 2 1
A StringParser.to_seconds() 0 4 1
A StringParser.parse_track_number_n_name() 0 6 1
A Timestamp.__init__() 0 2 1
A Timestamp.__lt__() 0 2 1
A Timestamp.from_duration() 0 3 1
A StringParser._generate_data() 0 28 5
A StringParser.duration_data_to_timestamp_data() 0 8 1
A Timestamp.__add__() 0 2 1
A Timestamp.__hash__() 0 2 1
A StringParser._parse_string() 0 14 4
A StringParser._parse_track_line() 0 10 1
A Timestamp.__pos() 0 6 3
A StringParser.parse_album_info() 0 16 1
A StringParser.convert_to_timestamps() 0 15 2
A Timestamp.__eq__() 0 2 1
A StringParser.parse_hhmmss_string() 0 7 1
A Timestamp.__le__() 0 2 1
A RegexSequence.search_n_dict() 0 2 2
A Timestamp.__str() 0 5 2
A Timestamp.__repr__() 0 2 1
A StringToDictParser._yield_reg_comp() 0 6 3
B Timestamp.__new__() 0 26 6
A StringParser.convert_tracks_data() 0 13 1
A StringParser.__track_file() 0 6 3
A AlbumInfoEntity.__init__() 0 3 1
A StringParser.time_format() 0 4 1
A AlbumInfoEntity.__str__() 0 2 1
A Timestamp.__int__() 0 2 1
A StringToDictParser.__init__() 0 5 2
A StringParser.__new__() 0 5 2
A RegexSequence.__init__() 0 3 1
A StringToDictParser.__call__() 0 9 5
A Timestamp.__sub__() 0 2 1

How to fix   Complexity   

Complexity

Complex classes like music_album_creation.tracks_parsing often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3 1
import os
4 1
import re
5 1
import time
6
7
8 1
class StringToDictParser:
    """Parse album information out of a video title string.

    The parser is configured with named entities (each mapped to a regex
    holding one capturing group) and a list of separator regexes. When called
    with a title and a ``design`` (a list of token sequences), it builds one
    regex per sequence and returns the richest dictionary of captured values.
    """

    # Separator tokens look like 's1', 's2', ...; the number is the 1-based
    # index into the separators list.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """Store the entity and separator regexes.

        :param dict entities: maps an entity name to its regex string
        :param list separators: regex strings, referenced in designs as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(separator, str) for separator in separators):
            raise RuntimeError("All separators must be strings; got {!r}".format(separators))
        self.entities = {name: AlbumInfoEntity(name, regex) for name, regex in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse the title given as first positional argument.

        :param str args: the first positional argument is the title to parse
        :param list design: (keyword) list of token sequences; each token is
            either an entity name or a separator reference such as 's1'
        :return: the dictionary with the most captured entities among all
            the sequences in the design (the first one wins on ties); an
            empty dictionary if the design is empty or nothing matched
        :rtype: dict
        :raises RuntimeError: if the design is malformed
        """
        title = args[0]
        design = kwargs['design']
        self._validate_design(design)
        sequences = [RegexSequence(list(self._yield_reg_comp(pattern))) for pattern in design]
        # default={} keeps an empty design from crashing inside max()
        return max((sequence.search_n_dict(title) for sequence in sequences), key=len, default={})

    def _validate_design(self, design):
        """Raise RuntimeError if any token sequence of the design is unusable."""
        max_length = len(self.entities) + len(self.separators)
        for pattern in design:
            if len(pattern) > max_length or not all(isinstance(token, str) for token in pattern):
                raise RuntimeError("Malformed design pattern: {!r}".format(pattern))
            for token in pattern:
                match = self.check.match(token)
                if match:
                    if int(match.group(1)) > len(self.separators):
                        raise RuntimeError("Separator '{}' is not defined; only {} separator(s) available".format(
                            token, len(self.separators)))
                elif token.startswith('s') and token not in self.entities:
                    # looks like a separator reference, but is neither a valid
                    # one nor the name of a known entity
                    raise RuntimeError("Invalid separator reference '{}' in design pattern {!r}".format(token, pattern))

    def _yield_reg_comp(self, kati):
        """Yield, per token, either the referenced separator regex or the entity object.

        :param kati: iterable of tokens (entity names or separator references)
        """
        for token in kati:
            match = self.check.match(token)
            if match:
                yield self.separators[int(match.group(1)) - 1]
            else:
                yield self.entities[token]
34
35 1
class AlbumInfoEntity:
    """A named piece of album information paired with the regex that captures it."""

    def __init__(self, name, reg):
        """Remember the entity's name and its regex string."""
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the regex string, so the entity can be joined into a larger pattern."""
        return self.reg
42
43
44 1
class RegexSequence:
    """A regex assembled from a sequence of components, plus the names of its captured groups."""

    def __init__(self, data):
        """Build the pattern by concatenating the string form of every component.

        Components exposing a ``name`` attribute contribute a key, in order;
        all the others (eg plain separator strings) only contribute to the pattern.
        """
        self._keys = [component.name for component in data if hasattr(component, 'name')]
        self._regex = ''.join(map(str, data))

    def search_n_dict(self, string):
        """Search the string and map each key to its captured value.

        Keys whose group captured nothing (or an empty string) are left out;
        when the pattern does not match at all, the result is an empty dict.
        """
        match = re.search(self._regex, string)
        if match is None:
            captured = [''] * len(self._keys)
        else:
            captured = match.groups()
        return {key: value for key, value in zip(self._keys, captured) if value}
51
52
53 1
class StringParser:
    """Singleton collecting the string-parsing helpers for album and track information.

    Every helper is a class or static method, so they can be used with or
    without instantiating the class.
    """
    __instance = None

    # Building blocks of the track-related regexes; the patterns using them
    # are compiled in verbose (re.X) mode.
    regexes = {'track_number': r'\d{1,2}',
               'sep1': r"(?: [\t\ ]* [\.\-\)]+ )? [\t ]*",
               'track_word': r"\(?[\w'][\w\-’':]*\)?",
               'track_sep': r'[\t\ ,]+',
               'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
               'extension': r'\.mp3',
               'hhmmss': r'(?:\d?\d:)*\d?\d'}
    # Registered at class-definition time (and not on first instantiation), so
    # that the class methods below work even if the class is never instantiated.
    regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)

    ## to parse from youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser({'artist': art, 'album': alb, 'year': year}, [sep1, sep2])

    def __new__(cls, *args, **kwargs):
        """Return the one shared instance, creating it on the first call."""
        if not cls.__instance:
            cls.__instance = super().__new__(cls)
        return cls.__instance

    ## STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title string into a dictionary with potentially all of the 'artist', 'album' and 'year' fields.

        Can parse patterns:
         - Artist Album Year
         - Artist Album
         - Album Year
         - Album

        :param str video_title:
        :return: the extracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(video_title, design=[['artist', 's1', 'album', 's2', 'year'],
                                                          ['artist', 's1', 'album'],
                                                          ['album', 's2', 'year'],
                                                          ['album']])

    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Get a dict like {'track_number': 'number', 'track_name': 'name'} from a file name such as '1. - Loyal to the Pack.mp3'.

        Only the base name of the path is examined. The number part of the
        pattern is optional; when it is absent 'track_number' maps to None.

        :param str file_name: name of (or path to) an mp3 file
        :rtype: dict
        :raises AttributeError: if the file name does not match the expected pattern
        """
        regex = re.compile(r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(**cls.regexes), re.X)
        return dict(zip(['track_number', 'track_name'], regex.search(os.path.basename(file_name)).groups()))

    @classmethod
    def duration_data_to_timestamp_data(cls, duration_data):
        """Transform a list of [track_name, duration] pairs into the equivalent list of [track_name, starting_timestamp] pairs.

        :param list duration_data: eg: [['Know your enemy', '3:45'], ['Wake up', '4:53'], ['Testify', '4:32']]
        :return: one [track_name, starting_timestamp] list per input track; the first track starts at '0:00'
        :rtype: list
        """
        return [list(pair) for pair in cls._gen_timestamp_data(duration_data)]

    @staticmethod
    def _gen_timestamp_data(duration_data):
        """Generate (track_name, starting_timestamp) tuples out of (track_name, duration) pairs.

        The duration of the last track is never needed and hence never parsed.

        :param list of lists duration_data: each inner list has as 1st element a track name and as 2nd the track duration in hh:mm:ss format
        :return: tuples with timestamps instead of durations, ready to feed for segmentation
        :raises WrongTimestampFormat: if a needed duration is not a valid hh:mm:ss string
        """
        if not duration_data:
            return  # nothing to generate for an empty track list
        start = Timestamp('0:00')
        yield duration_data[0][0], str(start)
        for previous_track, track in zip(duration_data, duration_data[1:]):
            # a track starts where the previous one ends
            start += Timestamp(previous_track[1])
            yield track[0], str(start)

    # STRING TO LIST
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks (eg copy-pasted from a video description) to a list of lists.

        :param str tracks:
        :return: one [track_name, hhmmss] list per non-empty line
        :rtype: list
        :raises WrongTimestampFormat: if a non-empty line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Generate the parsed [track_name, hhmmss] list of every non-empty line.

        :param str tracks: a newline-separated string of lines corresponding to the tracks information
        :raises WrongTimestampFormat: reporting the (1-based) number and content of the first unparsable line
        """
        for line_index, line in enumerate(row.strip() for row in tracks.split('\n')):
            if line == '':
                continue
            try:
                parsed = cls._parse_track_line(line)
            except AttributeError as error:
                # the regex found no match and returned None instead of a match object
                raise WrongTimestampFormat("Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(line_index + 1, line)) from error
            yield parsed

    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a line such as '01. Doteru 3:45' into [track_name, hhmmss]; a leading track number is ignored.

        :raises AttributeError: if the line does not match the expected pattern
        """
        regex = re.compile(r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(**cls.regexes), re.X)
        return list(regex.search(track_line.strip()).groups())

    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform a newline-separated string of album tracks with their durations (in hhmmss format)
        to a list of strings with each track's starting timestamp.

        The first timestamp is the literal '0:00'; the following ones are formatted by `time_format`.

        :param str tracks_row_strings:
        :return: the list of each track's timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # the duration of the last track does not contribute to any starting timestamp
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.time_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string timestamp to its equivalent duration in seconds as an integer."""
        return sum(60 ** power * int(field) for power, field in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def time_format(seconds):
        """Transform an integer representing a time duration in seconds to its equivalent hh:mm:ss formatted string representation."""
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    @classmethod
    def convert_tracks_data(cls, data, album_file, target_directory=''):
        """Convert the input Nx2 list of lists to an Nx3 list of lists; the exception being the last list, that has 2 elements.

        The input list's inner lists' elements are 'track_name' and 'starting_timestamp' in hhmmss format.

        :param list of lists data: each inner list should contain track title (no need for number and without extension)
        and starting time stamp in hh:mm:ss format
        :param str album_file: the path to the audio file of the entire album to potentially segment
        :param str target_directory: path to desired directory path to store the potentially created album
        :return: each inner list contains track path and timestamp(s) in seconds (as strings)
        :rtype: list of lists
        :raises TrackTimestampsSequenceError: if the starting timestamps are not strictly increasing
        """
        return [list(row) for row in cls._generate_data(data, album_file, target_directory)]

    @classmethod
    def _generate_data(cls, data, album_file, target_directory):
        """Generate the tuples needed to segment an album file into track files.

        For every track but the last a 3-element tuple is generated, with the
        track file path, the starting and the ending timestamp (both in
        seconds, as strings). The last tuple has 2 elements, since it
        naturally misses the ending timestamp. Track files are named
        '<2-digit index> - <track name><extension of the album file>'.

        :param list data: 2-element lists with track name and starting timestamp in hh:mm:ss format
        :param str album_file: path to the album file; only its extension is used
        :param str target_directory: directory to place the track files in
        :returns: tuples with track_file_path, starting_timestamp[, ending_timestamp]
        :rtype: tuple
        :raises TrackTimestampsSequenceError: if a track does not start after the previous one
        """
        if not data:
            return  # no tracks, nothing to generate
        # os.path.splitext only looks at the last path component, so dots in
        # directory names are not mistaken for the start of the extension
        extension = os.path.splitext(album_file)[1]
        for position, (track, next_track) in enumerate(zip(data, data[1:]), start=1):
            if Timestamp(next_track[1]) <= Timestamp(track[1]):
                raise TrackTimestampsSequenceError(
                    "Track '{} - {}' starting timestamp '{}' should be 'bigger' than track's '{} - {}'; '{}'".format(
                        position + 1, next_track[0], next_track[1],
                        position, track[0], track[1]))
            yield (
                cls.__track_file(target_directory, position, track[0], extension),
                str(int(Timestamp(track[1]))),
                str(int(Timestamp(next_track[1])))
            )
        yield (
            cls.__track_file(target_directory, len(data), data[-1][0], extension),
            str(int(Timestamp(data[-1][1]))),
        )

    @staticmethod
    def __track_file(target_directory, track_index, track_name, extension):
        """Build the path of a track file: '<target_directory>/<2-digit index> - <track_name><extension>'."""
        return os.path.join(target_directory, '{:02d} - {}{}'.format(track_index, track_name, extension))
253
254
255 1
class Timestamp:
    """A point in time (or a duration) given in 'hh:mm:ss' format, with one shared instance per distinct value.

    Accepted strings have 1 to 3 colon-separated fields of 1 or 2 digits,
    each between 0 and 60 (eg '45', '3:45', '03:45', '1:03:45').
    Instances convert to seconds with `int`, compare and hash by their
    number of seconds and render in a canonical minimal form with `str`:
    'm:ss' when there are no hours, 'h:mm:ss' otherwise.
    """

    # Every instance created so far, keyed by its canonical string stripped
    # of zero padding (eg '3:45', '1:3:5').
    instances = {}

    _hhmmss_regex = re.compile(r'^(?:(\d?\d):){0,2}(\d?\d)$')

    @classmethod
    def __str(cls, element):
        """Zero-pad a single-digit field to two digits."""
        if len(element) == 1:
            return '0{}'.format(int(element))
        return element

    def __new__(cls, hhmmss):
        """Return the instance for the given time, reusing a cached one when available.

        :param str hhmmss: the time in 'hh:mm:ss', 'mm:ss' or 'ss' format
        :raises WrongTimestampFormat: if the string is malformed or a field exceeds 60
        """
        if not cls._hhmmss_regex.search(hhmmss):
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))
        groups = hhmmss.split(':')
        if not all(0 <= int(field) <= 60 for field in groups):
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))

        # The fields are strings, so they have to be converted before being
        # compared to zero; otherwise a zero hours field is never dropped and
        # eg '00:03:45' and '3:45' end up as different cached instances.
        if len(groups) == 3 and int(groups[0]) != 0:
            minlength_string = ':'.join([str(int(groups[0]))] + [cls.__str(field) for field in groups[1:]])
        else:
            minutes = int(groups[-2]) if len(groups) > 1 else 0
            minlength_string = '{}:{}'.format(minutes, cls.__str(groups[-1]))
        stripped_string = ':'.join(str(int(field)) for field in minlength_string.split(':'))

        if stripped_string in cls.instances:
            return cls.instances[stripped_string]
        instance = super().__new__(cls)
        instance.__minlength_string = minlength_string
        instance.__stripped_string = stripped_string
        instance._s = sum(60 ** power * int(field) for power, field in enumerate(reversed(groups)))
        cls.instances[stripped_string] = instance
        return instance

    def __init__(self, hhmmss):
        """Do nothing; instances are fully set up (or fetched from the cache) in `__new__`."""
        pass

    @staticmethod
    def from_duration(seconds):
        """Return the Timestamp corresponding to the given number of seconds.

        :param int seconds: formatted through `time.gmtime`
        """
        return Timestamp(time.strftime('%H:%M:%S', time.gmtime(seconds)))

    def __int__(self):
        """Return the total number of seconds."""
        return self._s

    def __repr__(self):
        return self.__minlength_string

    def __str__(self):
        return self.__minlength_string

    def __hash__(self):
        return self._s

    def __eq__(self, other):
        """Compare by hash, ie by number of seconds when the other operand is a Timestamp."""
        try:
            return hash(self) == hash(other)
        except TypeError:
            # unhashable operands (eg lists) can never be equal to a Timestamp
            return NotImplemented

    def __lt__(self, other):
        return int(self) < int(other)

    def __le__(self, other):
        return int(self) <= int(other)

    def __gt__(self, other):
        return int(other) < int(self)

    def __ge__(self, other):
        return int(other) <= int(self)

    def __add__(self, other):
        """Return the Timestamp that many seconds later."""
        return Timestamp.from_duration(int(self) + int(other))

    def __sub__(self, other):
        """Return the Timestamp that many seconds earlier."""
        # NOTE(review): a negative difference is passed to time.gmtime as is -- confirm callers never subtract a bigger value
        return Timestamp.from_duration(int(self) - int(other))
337
338
339
class WrongTimestampFormat(Exception):
    """Raised when a string cannot be interpreted as an 'hh:mm:ss' timestamp or as a track line."""
340
class TrackTimestampsSequenceError(Exception):
    """Raised when a track's starting timestamp is not after the previous track's one."""
341