1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
1 |
|
import os |
4
|
1 |
|
import re |
5
|
1 |
|
import time |
6
|
|
|
|
7
|
|
|
|
8
|
1 |
|
class StringToDictParser:
    """Parses album information out of video title string.

    The parser is built from named entities (each a regex fragment) and a list
    of separator regex fragments. Calling the instance with a title and a
    ``design`` tries every layout in the design and returns the richest result.

    A design is a list of layouts; a layout is a list of string tokens. A token
    starting with ``'s'`` is always treated as a separator reference and must
    look like ``'s<N>'`` with ``N >= 1`` (1-based index into the separators);
    every other token is looked up as an entity name.
    """

    # Matches separator tokens such as 's1', 's2', ...; group 1 is the index.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps entity name to its regex fragment
        :param list separators: regex fragments, referenced as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError(
                "Every separator must be a string; got: {!r}".format(separators))
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse a title using the layouts given in the ``design`` keyword.

        :param str args[0]: the title string to parse
        :param list design: list of layouts (lists of string tokens)
        :return: the extracted fields of the layout that yielded most fields
        :rtype: dict
        :raises RuntimeError: if a layout is too long, holds a non-string token
            or holds a malformed separator token
        """
        title = args[0]
        design = kwargs['design']
        max_tokens = len(self.entities) + len(self.separators)
        if not all(len(x) <= max_tokens and all(isinstance(y, str) for y in x) for x in design):
            raise RuntimeError(
                "Every design layout must hold at most {} string tokens; got: {!r}".format(max_tokens, design))
        if not all(all(StringToDictParser.check.match(y) for y in x if y.startswith('s')) for x in design):
            raise RuntimeError(
                "Tokens starting with 's' must look like 's1', 's2', ...; got: {!r}".format(design))
        rregs = [RegexSequence(list(self._yield_reg_comp(d))) for d in design]
        # On ties max() keeps the first layout, i.e. the earliest one in the design.
        return max((r.search_n_dict(title) for r in rregs), key=len)

    def _yield_reg_comp(self, kati):
        """Yield, per token of one layout, the separator string or the entity object."""
        for k in kati:
            if k.startswith('s'):
                # Separator tokens are 1-based, the separators list is 0-based.
                yield self.separators[int(StringToDictParser.check.match(k).group(1)) - 1]
            else:
                yield self.entities[k]
34
|
|
|
|
35
|
1 |
|
class AlbumInfoEntity:
    """A named piece of album information paired with the regex fragment that captures it.

    ``str(entity)`` gives the regex fragment, so entities and plain separator
    strings can be joined into one pattern. The ``name`` attribute is what
    tells an entity apart from a separator when a sequence is assembled.
    """

    def __init__(self, name, reg):
        """
        :param str name: the field name, used as key in the parsed result
        :param str reg: the regex fragment for this field
        """
        self.name = name
        self.reg = reg

    def __str__(self):
        return self.reg

    def __repr__(self):
        return '{}({!r}, {!r})'.format(type(self).__name__, self.name, self.reg)
42
|
|
|
|
43
|
|
|
|
44
|
1 |
|
class RegexSequence:
    """A regex assembled from an ordered sequence of entities and separators.

    Elements that expose a ``name`` attribute contribute a result key, in order;
    every element contributes ``str(element)`` to the concatenated pattern.
    Keys are paired positionally with the pattern's capture groups.
    """

    def __init__(self, data):
        """
        :param data: iterable of objects whose str() is a regex fragment
        """
        # Materialize first: data is traversed twice and may be a generator.
        data = list(data)
        self._keys = [d.name for d in data if hasattr(d, 'name')]
        self._regex = ''.join(str(d) for d in data)

    def search_n_dict(self, string):
        """Search the string and map the keys to their captured values.

        :param str string: the text to search
        :return: key to captured text, skipping empty or unmatched groups;
            an empty dict when the pattern does not match at all
        :rtype: dict
        """
        match = re.search(self._regex, string)
        if match is None:
            return {}
        return {key: value for key, value in zip(self._keys, match.groups()) if value}
51
|
|
|
|
52
|
|
|
|
53
|
1 |
|
class StringParser:
    """Singleton collection of string parsing helpers for album and track data.

    Every helper is a classmethod or staticmethod, so the class can be used
    without being instantiated.
    """
    __instance = None

    # Regex fragments combined with str.format; the combined patterns are
    # compiled in verbose mode (re.X), so bare whitespace inside is ignored.
    regexes = {'track_number': r'\d{1,2}',
               'sep1': r"(?: [\t\ ]* [\.\-\)]+ )? [\t ]*",
               'track_word': r"\(?[\w'][\w\-’':]*\)?",
               'track_sep': r'[\t\ ,]+',
               'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
               'extension': r'\.mp3',
               'hhmmss': r'(?:\d?\d:)*\d?\d'}
    # Defined here, not only in __new__, so the classmethods work even when the
    # class has never been instantiated.
    regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)

    ## to parse from youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser({'artist': art, 'album': alb, 'year': year}, [sep1, sep2])

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first use."""
        if cls.__instance is None:
            cls.__instance = super().__new__(cls)
            cls.regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**cls.regexes)
        return cls.__instance

    ## STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dict of potentially all 'artist', 'album' and 'year' fields.

        Can parse patterns:
         - Artist Album Year
         - Artist Album
         - Album Year
         - Album

        :param str video_title: the title to parse
        :return: the extracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(video_title, design=[['artist', 's1', 'album', 's2', 'year'],
                                                          ['artist', 's1', 'album'],
                                                          ['album', 's2', 'year'],
                                                          ['album']])

    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Get a dict like {'track_number': 'number', 'track_name': 'name'} from a file name such as '1. - Loyal to the Pack.mp3'.

        Only the base name of the path is examined. The leading number is
        optional in the pattern; 'track_number' is None when it is absent.

        :param str file_name: file name or path ending in the '.mp3' extension
        :rtype: dict
        :raises AttributeError: if the base name does not match the pattern
        """
        regex = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(**cls.regexes), re.X)
        return dict(zip(['track_number', 'track_name'], regex.search(os.path.basename(file_name)).groups()))

    @classmethod
    def duration_data_to_timestamp_data(cls, duration_data):
        """Transform a list of [track_name, duration] pairs into the equivalent list with starting timestamps in place of the durations.

        :param list duration_data: eg: [['Know your enemy', '3:45'], ['Wake up', '4:53'], ['Testify', '4:32']]
        :return: eg: [['Know your enemy', '0:00'], ['Wake up', '3:45'], ['Testify', '8:38']]
        :rtype: list
        """
        return [list(_) for _ in cls._gen_timestamp_data(duration_data)]

    @staticmethod
    def _gen_timestamp_data(duration_data):
        """Yield (track_name, starting_timestamp) pairs; nothing for empty input.

        :param list duration_data: each inner list has as 1st element a track name and as 2nd the track duration in hh:mm:ss format
        :return: pairs with timestamps instead of durations, ready to feed for segmentation
        """
        if not duration_data:
            return
        start = Timestamp('0:00')
        yield duration_data[0][0], str(start)
        # Each track starts where the previous one ends.
        for previous, current in zip(duration_data, duration_data[1:]):
            start = start + Timestamp(previous[1])
            yield current[0], str(start)

    # STRING TO LIST
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks (eg copy-pasted from a video description) into a list of lists.

        :param str tracks: one track per line, eg 'trackname - 3:45'; blank lines are skipped
        :return: inner lists hold [track_name, hhmmss_string]
        :rtype: list
        :raises WrongTimestampFormat: if a non-blank line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed [track_name, hhmmss] list of every non-blank line.

        :param str tracks: a newline-separated string of lines corresponding to the tracks information
        :raises WrongTimestampFormat: if a line cannot be parsed; the message holds the 1-based line number
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                # _parse_track_line fails with AttributeError when the regex finds no match.
                raise WrongTimestampFormat("Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(i + 1, line))

    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a string line such as '01. Doteru 3:45' into [track_name, hhmmss].

        A leading track number, if present, is matched but not captured.

        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(**cls.regexes), re.X)
        return list(regex.search(track_line.strip()).groups())

    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform a newline-separated string of album tracks with durations into the list of each track's starting timestamp.

        The first timestamp is the literal '0:00'; the following ones are
        produced by `add`, hence formatted as 'HH:MM:SS'.

        :param str tracks_row_strings: one track per line, with its duration
        :return: the list of each track's timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # The last track's duration is not needed: nothing starts after it.
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.time_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string timestamp to its equivalent duration in seconds as an integer."""
        return sum(60 ** i * int(x) for i, x in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def time_format(seconds):
        """Transform an integer time duration in seconds to its equivalent 'HH:MM:SS' formatted string representation."""
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    @classmethod
    def convert_tracks_data(cls, data, album_file, target_directory=''):
        """Convert the input Nx2 list of lists to an Nx3 list of lists; the last inner list keeps 2 elements.

        The input inner lists hold 'track_name' and 'starting_timestamp' in hhmmss format.

        :param list data: each inner list should contain track title (no need for number and without extension)
            and starting timestamp in hh:mm:ss format
        :param str album_file: the path to the audio file of the entire album to potentially segment
        :param str target_directory: path to the desired directory to store the potentially created album
        :return: each inner list contains track path and timestamps in seconds (as strings)
        :rtype: list
        :raises TrackTimestampsSequenceError: if the timestamps are not strictly increasing
        """
        return [list(_) for _ in cls._generate_data(data, album_file, target_directory)]

    @classmethod
    def _generate_data(cls, data, album_file, target_directory):
        """Yield (track_file_path, start_seconds, end_seconds) tuples ready for audio segmentation.

        The last tuple has only 2 elements since it has no ending timestamp.
        Nothing is yielded for empty data.

        :param list data: inner lists of [track_name, starting timestamp in hh:mm:ss]
        :param str album_file: path of the album file; its extension is reused for the tracks
        :param str target_directory: directory to prefix the track file names with
        :raises TrackTimestampsSequenceError: if a timestamp is not bigger than the previous one
        """
        # NOTE: this state lives on the class and is read by __track_file.
        cls.__album_file = album_file
        cls.__target_directory = target_directory
        cls.__track_index_generator = iter('{:02d}'.format(_) for _ in range(1, len(data) + 1))
        if not data:
            return
        for i, (current, following) in enumerate(zip(data, data[1:])):
            if Timestamp(following[1]) <= Timestamp(current[1]):
                raise TrackTimestampsSequenceError(
                    "Track '{} - {}' starting timestamp '{}' should be 'bigger' than track's '{} - {}'; '{}'".format(
                        i + 2, following[0], following[1],
                        i + 1, current[0], current[1]))
            yield (
                cls.__track_file(current[0]),
                str(int(Timestamp(current[1]))),
                str(int(Timestamp(following[1])))
            )
        yield (
            cls.__track_file(data[-1][0]),
            str(int(Timestamp(data[-1][1]))),
        )

    @classmethod
    def __track_file(cls, track_name):
        """Build the next '<index> - <track_name><extension>' path inside the target directory."""
        # splitext only inspects the last path component, so dots in directory
        # names are not mistaken for an extension.
        extension = os.path.splitext(cls.__album_file)[1]
        return os.path.join(cls.__target_directory, '{} - {}{}'.format(
            next(cls.__track_index_generator),
            track_name,
            extension))
253
|
|
|
|
254
|
|
|
|
255
|
1 |
|
class Timestamp:
    """A point in time given as an '[[hh:]mm:]ss' string, comparable and addable.

    Instances are cached: strings denoting the same normalized timestamp
    (eg '03:45' and '3:45') give the very same object. str() gives the shortest
    conventional form: at least 'm:ss', hours only when they are non-zero.
    """
    instances = {}  # normalized 'h:m:s' string (no zero padding) -> instance

    # One or two digits per field, up to three fields separated by ':'.
    _hhmmss_regex = re.compile(r'^(?:(\d?\d):){0,2}(\d?\d)$')

    @classmethod
    def __str(cls, element):
        """Left-pad a single-digit field with a zero."""
        if len(element) == 1:
            return '0{}'.format(int(element))
        return element

    @classmethod
    def __pos(cls, array):
        """Return the index of the first field with a non-zero value, or len(array) if none."""
        i = 0
        # Fields are digit strings, so compare their numeric value.
        while i < len(array) and int(array[i]) == 0:
            i += 1
        return i

    def __new__(cls, *args, **kwargs):
        """
        :param str args[0]: the timestamp in '[[hh:]mm:]ss' format
        :raises WrongTimestampFormat: if the string is malformed or a field is above 60
        """
        hhmmss = args[0]
        if not cls._hhmmss_regex.search(hhmmss):
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))
        groups = hhmmss.split(':')
        if not all(0 <= int(_) <= 60 for _ in groups):
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))

        ind = cls.__pos(groups)
        if len(groups) == 1:
            # Seconds only: render as '0:ss'.
            minlength_string = '{}:{}'.format(0, cls.__str(groups[0]))
        elif len(groups) - ind - 1 < 2:
            # No significant hours field: render as 'm:ss'.
            minlength_string = '{}:{}'.format(int(groups[-2]), cls.__str(groups[-1]))
        else:
            # Pad the trailing fields so the text does not depend on the input spelling.
            minlength_string = ':'.join([str(int(groups[ind]))] + [cls.__str(y) for y in groups[ind + 1:]])
        stripped_string = ':'.join(str(int(_)) for _ in minlength_string.split(':'))

        if stripped_string in cls.instances:
            return cls.instances[stripped_string]
        x = super().__new__(cls)
        x.__minlength_string = minlength_string
        x.__stripped_string = stripped_string
        x._s = sum(60 ** i * int(group) for i, group in enumerate(reversed(groups)))
        cls.instances[x.__stripped_string] = x
        return x

    def __init__(self, hhmmss):
        # All the state is set in __new__, since instances are cached.
        pass

    @staticmethod
    def from_duration(seconds):
        """Build a Timestamp from a number of seconds, going through the 'HH:MM:SS' text form."""
        return Timestamp(time.strftime('%H:%M:%S', time.gmtime(seconds)))

    def __int__(self):
        """The timestamp as a total number of seconds."""
        return self._s

    def __repr__(self):
        return self.__minlength_string

    def __str__(self):
        return self.__minlength_string

    def __hash__(self):
        return self._s

    def __eq__(self, other):
        return hash(self) == hash(other)

    def __lt__(self, other):
        return int(self) < int(other)

    def __le__(self, other):
        return int(self) <= int(other)

    def __gt__(self, other):
        return int(other) < int(self)

    def __ge__(self, other):
        return int(other) <= int(self)

    def __add__(self, other):
        return Timestamp.from_duration(int(self) + int(other))

    def __sub__(self, other):
        return Timestamp.from_duration(int(self) - int(other))
337
|
|
|
|
338
|
|
|
|
339
|
|
|
class WrongTimestampFormat(Exception):
    """Raised when a timestamp string or a track line cannot be parsed in the expected 'hh:mm:ss' based format."""
340
|
|
|
class TrackTimestampsSequenceError(Exception):
    """Raised when a track's starting timestamp is not bigger than the previous track's one."""
341
|
|
|
|