1
|
|
|
import os |
2
|
|
|
import re |
3
|
|
|
import time |
4
|
|
|
|
5
|
|
|
|
6
|
|
|
class StringToDictParser:
    """Extract named fields (album information) from a video title string.

    The parser is configured with named regex fragments ("entities") and a
    list of separator regex fragments. When called, it is given one or more
    "designs": sequences of entity names and separator tokens ('s1', 's2',
    ...) describing candidate title layouts. Each design is turned into one
    regular expression and the richest resulting dictionary is returned.
    """

    # A separator token is 's' followed by a positive integer without leading
    # zeros; the integer is a 1-based index into `self.separators`.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps a field name to the regex fragment capturing it
        :param list separators: regex fragments (plain strings) usable between fields
        """
        assert all(type(separator) is str for separator in separators)
        self.entities = {
            field_name: AlbumInfoEntity(field_name, pattern)
            for field_name, pattern in entities.items()
        }
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse a title against every given design and return the largest match.

        The title is the first positional argument; the candidate layouts are
        passed through the mandatory 'design' keyword argument.

        :return: the dictionary with the most extracted fields (first one wins ties)
        :rtype: dict
        """
        title = args[0]
        design = kwargs['design']

        # Every layout must be a sequence of strings no longer than the number
        # of known building blocks.
        longest_allowed = len(self.entities) + len(self.separators)
        assert all(
            0 <= len(layout) <= longest_allowed
            and all(type(token) is str for token in layout)
            for layout in design
        )
        # Anything starting with 's' has to be a well-formed separator token.
        assert all(
            all(
                StringToDictParser.check.match(token)
                for token in layout
                if token.startswith('s')
            )
            for layout in design
        )

        sequences = []
        for layout in design:
            sequences.append(RegexSequence(list(self._yield_reg_comp(layout))))

        extracted = []
        for sequence in sequences:
            extracted.append(sequence.search_n_dict(title))
        return max(extracted, key=len)

    def _yield_reg_comp(self, kati):
        """Translate the tokens of one design into entities and separator strings."""
        for token in kati:
            if not token.startswith('s'):
                yield self.entities[token]
                continue
            separator_number = int(StringToDictParser.check.match(token).group(1))
            yield self.separators[separator_number - 1]
29
|
|
|
|
30
|
|
|
class AlbumInfoEntity:
    """A named field paired with the regex fragment used to capture it."""

    def __init__(self, name, reg):
        """
        :param name: key under which the captured value is reported
        :param reg: regex fragment; this is what str() of the entity returns
        """
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the regex fragment so entities can be joined like plain strings."""
        pattern = self.reg
        return pattern
37
|
|
|
|
38
|
|
|
|
39
|
|
|
class RegexSequence:
    """A regular expression assembled from an ordered list of components.

    Components that expose a 'name' attribute contribute a result key; every
    component contributes str(component) to the concatenated pattern.
    """

    def __init__(self, data):
        """
        :param data: ordered components (named entities and/or plain separator strings)
        """
        self._keys = [part.name for part in data if hasattr(part, 'name')]
        self._regex = ''.join(map(str, data))

    def search_n_dict(self, string):
        """Search the string and map the component names to the captured groups.

        Keys whose group is empty or did not participate are left out; an
        empty dictionary is returned when the pattern is not found at all.

        :param str string: the text to search
        :rtype: dict
        """
        found = re.search(self._regex, string)
        if found is None:
            return {}
        result = {}
        for key, value in zip(self._keys, found.groups()):
            if value:
                result[key] = value
        return result

    def __str__(self):
        """Return the concatenated regular expression."""
        return self._regex
49
|
|
|
|
50
|
|
|
|
51
|
|
|
class StringParser:
    """Singleton collection of helpers that parse track and album strings.

    Covers three areas: parsing track lines / file names, converting between
    track durations and starting timestamps (hh:mm:ss strings), and parsing
    album information out of a video title.
    """
    __instance = None

    track_number = r'\d{1,2}'
    track_name = r'[\w\'\(\) \-’]*[\w)]'
    sep = r'(?:[\t ]+|[\t ]*[\.\-,]+[\t ]*)'
    extension = r'\.mp3'
    hhmmss = r'(?:\d?\d:)*\d?\d'

    # Fragments used to parse a youtube video title string.
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser({'artist': art, 'album': alb, 'year': year}, [sep1, sep2])

    def __new__(cls, *args, **kwargs):
        """Create the instance on first use and hand back the same one afterwards."""
        if cls.__instance is None:
            cls.__instance = super().__new__(cls)
        return cls.__instance

    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Split a track file name into its number and its name.

        The file name must contain the track number and end with the
        configured extension, e.g. '1. - Loyal to the Pack.mp3'.

        :param str file_name: the track's file name
        :return: a dict like {'track_number': 'number', 'track_name': 'name'}
        :rtype: dict
        :raises AttributeError: if the file name does not follow the expected format
        """
        pattern = r'({}){}({}){}$'.format(cls.track_number, cls.sep2, cls.track_name, cls.extension)
        match = re.search(pattern, file_name)
        if match is None:
            # AttributeError is what the failed match historically produced;
            # the type is kept so existing callers keep working.
            raise AttributeError("Couldn't parse track number and name from '{}'".format(file_name))
        return dict(zip(['track_number', 'track_name'], match.groups()))

    @classmethod
    def duration_data_to_timestamp_data(cls, duration_data):
        """Transform per-track durations into per-track starting timestamps.

        :param list duration_data: list of [track_name, duration] with durations in hh:mm:ss format
        :return: list of [track_name, starting_timestamp]; empty when the input is empty
        :rtype: list
        """
        return [list(pair) for pair in cls._gen_timestamp_data(duration_data)]

    @staticmethod
    def _gen_timestamp_data(duration_data):
        """Yield (track_name, starting_timestamp) pairs computed from durations.

        The first track starts at '0:00'; every other track starts where the
        accumulated durations of the previous tracks end.

        :param list duration_data: each inner list has the track name as 1st element and the track duration in hh:mm:ss format as 2nd
        :return: pairs with timestamps instead of durations, ready to feed for segmentation
        """
        if not duration_data:
            return
        start = Timestamp('0:00')
        yield duration_data[0][0], str(start)
        for index in range(1, len(duration_data)):
            start = start + Timestamp(duration_data[index - 1][1])
            yield duration_data[index][0], str(start)

    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks into a list of lists.

        :param str tracks: e.g. text copy-pasted from a video description
        :return: inner lists hold [track_name, hhmmss_timestamp]
        :rtype: list
        :raises WrongTimestampFormat: if a non-empty line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed [track_name, hhmmss] of every non-empty line.

        :param str tracks: a newline-separated string of lines corresponding to the tracks information
        :raises WrongTimestampFormat: naming the 1-based line number (blank lines included) that failed
        """
        for line_number, raw_line in enumerate(tracks.split('\n'), start=1):
            line = raw_line.strip()
            if line == '':
                continue
            try:
                parsed = cls._parse_track_line(line)
            except AttributeError as error:
                raise WrongTimestampFormat("Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(line_number, line)) from error
            yield parsed

    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a single line such as '01. Doteru 3:45' into [track_name, hhmmss].

        :raises AttributeError: if the line does not follow the expected format
        """
        # Pattern layout:
        #   optional leading track number plus separator (ignored),
        #   track name (captured),
        #   separator between name and time,
        #   time in hh:mm:ss format (captured) up to the end of the line.
        regex = re.compile(r"^(?:{}{})?({}){}({})$".format(cls.track_number, cls.sep, cls.track_name, cls.sep, cls.hhmmss))
        match = regex.search(track_line.strip())
        if match is None:
            # _parse_string relies on AttributeError to report the bad line.
            raise AttributeError("Couldn't parse track line '{}'".format(track_line))
        return list(match.groups())

    @classmethod
    def get_instance(cls):
        """Return the singleton StringParser instance."""
        return StringParser()

    @classmethod
    def parse_tracks_hhmmss(cls, tracks_row_strings):
        """Parse a multiline string into track titles and hh:mm:ss values.

        Track numbers potentially found at the start of a line are ignored.

        :param str tracks_row_strings: newline-separated track lines
        :return: a list of lists, one per non-empty input row, each with 2 elements: the track name and the hh:mm:ss value
        :rtype: list
        """
        return cls.parse_hhmmss_string(tracks_row_strings)

    @classmethod
    def hhmmss_durations_to_timestamps(cls, hhmmss_list):
        """Transform a list of hh:mm:ss durations into starting timestamps.

        :param list hhmmss_list: the duration of each track, in order
        :return: starting timestamps; the first one is always '0:00'
        :rtype: list
        """
        return list(cls._generate_timestamps(hhmmss_list))

    @classmethod
    def _generate_timestamps(cls, hhmmss_list):
        """Yield '0:00' followed by the running sum of all durations but the last."""
        current = '0:00'
        yield current
        # The last duration is not needed: no track starts after it ends.
        for duration in hhmmss_list[:-1]:
            current = cls.add(current, duration)
            yield current

    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform track lines holding durations into starting timestamps.

        :param str tracks_row_strings: newline-separated track lines (eg copy-pasted from the youtube video description) whose hh:mm:ss values are durations
        :return: the list of each track's starting timestamp in hh:mm:ss format
        :rtype: list
        """
        lines = cls.parse_tracks_hhmmss(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1: str, duration: str) -> str:
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.time_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string into its equivalent number of seconds (int)."""
        return sum(60 ** power * int(part) for power, part in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def time_format(seconds):
        """Transform a duration in seconds into its hh:mm:ss string representation.

        Computed arithmetically rather than through calendar time, so values
        of 24 hours or more do not wrap around to zero.

        :param int seconds: non-negative duration in seconds
        :return: zero-padded 'HH:MM:SS' string
        :rtype: str
        :raises ValueError: if seconds is negative
        """
        total = int(seconds)
        if total < 0:
            raise ValueError("Cannot format a negative duration: {}".format(seconds))
        minutes, secs = divmod(total, 60)
        hours, minutes = divmod(minutes, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)

    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dictionary of 'artist', 'album' and 'year' fields.

        Patterns tried:
         - Artist Album Year
         - Artist Album
         - Album Year
         - Album

        :param str video_title:
        :return: the extracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(video_title, design=[['artist', 's1', 'album', 's2', 'year'],
                                                          ['artist', 's1', 'album'],
                                                          ['album', 's2', 'year'],
                                                          ['album']])

    @classmethod
    def convert_tracks_data(cls, data, album_file, target_directory=''):
        """Convert an Nx2 list of lists to segmentation-ready data.

        Every resulting inner list has 3 elements, except the last one which
        has 2 because the final track has no ending timestamp.

        :param list data: each inner list should contain the track title (no number, no extension) and the starting timestamp in hh:mm:ss format
        :param str album_file: the path to the audio file of the entire album to potentially segment
        :param str target_directory: path to the directory where the track files would be stored
        :return: inner lists of [track_file_path, start_seconds, end_seconds] with the seconds as strings; empty when data is empty
        :rtype: list
        :raises TrackTimestampsSequenceError: if the timestamps are not strictly increasing
        """
        return [list(item) for item in cls._generate_data(data, album_file, target_directory)]

    @classmethod
    def _generate_data(cls, data, album_file, target_directory):
        """Yield (track_file_path, start_seconds[, end_seconds]) tuples.

        Track files are named '<NN> - <track name><ext>' inside the target
        directory, with NN the zero-padded 1-based track index and ext the
        extension of the album file. All per-call values are kept in locals,
        so concurrent or nested calls cannot interfere with each other.

        :param list data: inner 2-element lists of track name and starting timestamp (hh:mm:ss)
        :param str album_file: path to the album file; only its extension is used
        :param str target_directory: directory prefix for the yielded paths
        :returns: 3-element tuples, and a final 2-element tuple lacking the ending timestamp
        :rtype: tuple
        :raises TrackTimestampsSequenceError: if a track does not start after the previous one
        """
        if not data:
            return
        # splitext only inspects the final path component, so a dot in a
        # directory name is never mistaken for the file extension.
        extension = os.path.splitext(album_file)[1]
        last = len(data) - 1
        for i in range(last):
            name, start = data[i][0], data[i][1]
            next_name, next_start = data[i + 1][0], data[i + 1][1]
            if Timestamp(next_start) <= Timestamp(start):
                raise TrackTimestampsSequenceError(
                    "Track '{} - {}' starting timestamp '{}' should be 'bigger' than track's '{} - {}'; '{}'".format(
                        i + 2, next_name, next_start,
                        i + 1, name, start))
            yield (
                cls.__track_file(target_directory, i + 1, name, extension),
                str(int(Timestamp(start))),
                str(int(Timestamp(next_start)))
            )
        yield (
            cls.__track_file(target_directory, last + 1, data[-1][0], extension),
            str(int(Timestamp(data[-1][1]))),
        )

    @staticmethod
    def __track_file(target_directory, index, track_name, extension):
        """Build the path '<target_directory>/<NN> - <track_name><extension>'."""
        return os.path.join(target_directory, '{:02d} - {}{}'.format(index, track_name, extension))
263
|
|
|
|
264
|
|
|
|
265
|
|
|
class Timestamp:
    """A point in time (or duration) given as an 'hh:mm:ss' style string.

    Instances are cached per input string: constructing the same string twice
    returns the same object. Ordering comparisons use the total number of
    seconds, while equality compares the string representations.
    """
    instances = {}

    def __new__(cls, *args, **kwargs):
        """Return the cached instance for the given string or create a new one.

        :param str hhmmss: one to three colon-separated fields of one or two digits each, e.g. '3:45' or '1:02:03'
        :raises WrongTimestampFormat: if the string is malformed or a field exceeds 60
        """
        hhmmss = args[0] if args else kwargs['hhmmss']
        if hhmmss in cls.instances:
            return cls.instances[hhmmss]
        match = re.fullmatch(r'((\d?\d):){0,2}(\d?\d)', hhmmss)
        if not match:
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))
        components = [int(part) for part in hhmmss.split(':')]
        if not all(0 <= value <= 60 for value in components):
            raise WrongTimestampFormat("Timestamp given: '{}'. Please use the 'hh:mm:ss' format.".format(hhmmss))
        instance = super().__new__(cls)
        # Total seconds: the right-most field is seconds, then minutes, then hours.
        instance._s = sum(60 ** power * value for power, value in enumerate(reversed(components)))
        instance._b = hhmmss
        cls.instances[hhmmss] = instance
        return instance

    def __init__(self, hhmmss):
        # All state is set in __new__ so that cached instances are not re-initialised.
        pass

    @staticmethod
    def from_duration(seconds):
        """Build a Timestamp from a number of seconds.

        The 'HH:MM:SS' string is computed arithmetically rather than through
        calendar time, so durations of 24 hours or more do not wrap around.

        :param int seconds: non-negative duration in seconds
        :return: the Timestamp whose string is the zero-padded 'HH:MM:SS' form
        :rtype: Timestamp
        :raises WrongTimestampFormat: if seconds is negative or the hours field would exceed the accepted range
        """
        total = int(seconds)
        if total < 0:
            raise WrongTimestampFormat("Cannot build a timestamp from a negative duration: {}".format(seconds))
        minutes, secs = divmod(total, 60)
        hours, minutes = divmod(minutes, 60)
        return Timestamp('{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs))

    def __repr__(self):
        return self._b

    def __str__(self):
        return self._b

    def __eq__(self, other):
        # Equality is textual: '0:00' and '00:00:00' are different timestamps here.
        return str(self) == str(other)

    def __hash__(self):
        # Defining __eq__ disables the default hash; hash the same text that
        # __eq__ compares so instances can live in sets and dict keys.
        return hash(self._b)

    def __int__(self):
        return self._s

    def __lt__(self, other):
        return int(self) < int(other)

    def __le__(self, other):
        return int(self) <= int(other)

    def __gt__(self, other):
        return int(other) < int(self)

    def __ge__(self, other):
        return int(other) <= int(self)

    def __add__(self, other):
        return Timestamp.from_duration(int(self) + int(other))

    def __sub__(self, other):
        return Timestamp.from_duration(int(self) - int(other))
320
|
|
|
|
321
|
|
|
|
322
|
|
|
class WrongTimestampFormat(Exception):
    """Raised when a string cannot be interpreted as an 'hh:mm:ss' timestamp."""
323
|
|
|
class TrackTimestampsSequenceError(Exception):
    """Raised when track starting timestamps are not in strictly increasing order."""
324
|
|
|
|