|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
|
|
3
|
1 |
|
import os |
|
4
|
1 |
|
import re |
|
5
|
1 |
|
import time |
|
6
|
|
|
|
|
7
|
|
|
|
|
8
|
1 |
|
class StringToDictParser(object):
    """Extract named fields (e.g. artist, album, year) from a title string.

    The parser is configured with ``entities`` (field name -> regex fragment)
    and an ordered list of ``separators`` (regex fragments). Calling the
    instance with a title and a ``design`` tries every layout in the design
    and returns the richest dictionary of extracted fields.
    """

    # A design token of the form 's<N>' (N >= 1) refers to the N-th separator.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """Store the field regexes and the separator regexes.

        :param dict entities: maps a field name to its regex fragment
        :param list separators: regex fragments, referenced as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError('every separator must be a string')
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse the title (first positional argument) using ``design``.

        ``design`` (keyword argument) is a list of layouts; each layout is a
        list of tokens, where a token is either an entity name or 's<N>'.

        :return: the dictionary with the most extracted fields among all
            layouts (the earliest layout wins a tie)
        :rtype: dict
        :raises RuntimeError: if the design is malformed
        """
        title = args[0]
        design = kwargs['design']
        self._validate_design(design)
        rregs = [RegexSequence(list(self._yield_reg_comp(d))) for d in design]
        return max([r.search_n_dict(title) for r in rregs], key=len)

    def _validate_design(self, design):
        """Raise RuntimeError unless every layout only uses known tokens."""
        max_tokens = len(self.entities) + len(self.separators)
        for layout in design:
            if len(layout) > max_tokens:
                raise RuntimeError(
                    'layout {!r} has more than {} tokens'.format(layout, max_tokens)
                )
            for token in layout:
                if not isinstance(token, str):
                    raise RuntimeError(
                        'design token {!r} is not a string'.format(token)
                    )
                separator = self.check.match(token)
                if separator:
                    # Reject indexes beyond the configured separators here,
                    # instead of failing later with an IndexError.
                    if int(separator.group(1)) > len(self.separators):
                        raise RuntimeError(
                            'separator {!r} is out of range'.format(token)
                        )
                elif token not in self.entities:
                    raise RuntimeError(
                        'unknown design token {!r}'.format(token)
                    )

    def _yield_reg_comp(self, kati):
        """Yield the regex component (separator or entity) for each token."""
        for k in kati:
            separator = self.check.match(k)
            if separator:
                # Tokens are 1-based ('s1' is the first separator).
                yield self.separators[int(separator.group(1)) - 1]
            else:
                yield self.entities[k]
|
42
|
|
|
|
|
43
|
|
|
|
|
44
|
1 |
|
class AlbumInfoEntity(object):
    """A named regex fragment describing one field of a title string."""

    def __init__(self, name, reg):
        """
        :param str name: the field name (used as the key of the parsed value)
        :param str reg: the regex fragment that matches the field
        """
        self.name = name
        self.reg = reg

    def __str__(self):
        """Return the regex fragment, so entities can be joined into a regex."""
        return self.reg

    def __repr__(self):
        return '{}(name={!r}, reg={!r})'.format(
            type(self).__name__, self.name, self.reg
        )
|
51
|
|
|
|
|
52
|
|
|
|
|
53
|
1 |
|
class RegexSequence(object):
    """A regex assembled from an ordered sequence of components.

    Components that expose a ``name`` attribute contribute a key (in order);
    every component contributes ``str(component)`` to the regex.
    """

    def __init__(self, data):
        """
        :param data: iterable of components (named entities and plain strings)
        """
        # Materialize first: the components are traversed twice, which would
        # silently yield an empty regex for a one-shot iterable (generator).
        components = list(data)
        self._keys = [d.name for d in components if hasattr(d, 'name')]
        self._regex = ''.join(str(d) for d in components)

    def search_n_dict(self, string):
        """Search ``string`` and map each key to its captured group.

        Keys whose group is empty or did not participate are omitted.

        :param str string: the text to search
        :return: the extracted values; empty when the regex does not match
        :rtype: dict
        """
        match = re.search(self._regex, string)
        if match is None:
            return {}
        return {
            key: value
            for key, value in zip(self._keys, match.groups())
            if value
        }
|
73
|
1 |
|
|
|
74
|
1 |
|
|
|
75
|
1 |
|
class StringParser(object):
    """Singleton collecting the string-parsing helpers for album/track data.

    All parsing helpers are class or static methods, so they can be used
    directly on the class; instantiation always returns the same object.
    """

    __instance = None

    # The templates built from these fragments are compiled with the re.X
    # (VERBOSE) flag, because they contain whitespace on purpose for better
    # readability.
    regexes = {
        'track_number': r'\d{1,2}',  # greedy; relies on back-tracking
        'sep1': r"(?: [\t\ ]* [\.\-\,)]+ )? [\t ]*",
        'track_word_first_char': r"[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE]",
        'track_word_char': r"[\.\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]",
        'track_sep': r'[\t\ ,]+',
        'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
        'extension': r'\.mp[34]',
        'hhmmss': r'(?:\d?\d:)*\d?\d',
    }
    # Derived fragments are built at class-definition time, so the class
    # methods work even when the class has never been instantiated.
    regexes['track_word'] = (
        r'\(?{track_word_first_char}{track_word_char}*\)?'.format(**regexes)
    )
    regexes['track_name'] = (
        r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)
    )

    # Fragments used to parse a youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser(
        {'artist': art, 'album': alb, 'year': year}, [sep1, sep2]
    )

    def __new__(cls, *args, **kwargs):
        """Create the shared instance on first use and return it afterwards."""
        if cls.__instance is None:
            # super(StringParser, cls): the class comes first, then the object
            # (the reversed order fails for subclasses).
            cls.__instance = super(StringParser, cls).__new__(cls)
        return cls.__instance

    # STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dict of 'artist', 'album' and 'year'.

        Can parse the patterns:

        - Artist Album Year
        - Artist Album
        - Album Year
        - Album

        :param str video_title: the title to parse
        :return: the extracted values as a dictionary having maximally the
            keys 'artist', 'album' and 'year'
        :rtype: dict
        """
        return cls.album_info_parser(
            video_title,
            design=[
                ['artist', 's1', 'album', 's2', 'year'],
                ['artist', 's1', 'album'],
                ['album', 's2', 'year'],
                ['album'],
            ],
        )

    # PARSE filenames (uses cls.regexes)
    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Parse a file name such as '1. - Loyal to the Pack.mp3'.

        Only the base name of ``file_name`` is examined.

        :param str file_name: path or name of an .mp3/.mp4 file
        :return: a dict like {'track_number': 'number', 'track_name': 'name'};
            the number group is optional in the pattern, so 'track_number'
            is None when the file name carries no number
        :rtype: dict
        :raises AttributeError: if the file name does not match the pattern
        """
        pattern = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(
                **cls.regexes
            ),
            re.X,
        )
        groups = pattern.search(os.path.basename(file_name)).groups()
        return dict(zip(['track_number', 'track_name'], groups))

    # PARSE a single track line (uses cls.regexes)
    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a line such as '01. Doteru 3:45' into ['Doteru', '3:45'].

        A leading track number, when present, is matched but not captured.

        :param str track_line: one line of track information
        :return: the parsed items [track_name, hhmmss]
        :rtype: list
        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(
                **cls.regexes
            ),
            re.X,
        )
        return list(regex.search(track_line.strip()).groups())

    # PARSE tracks info multiline
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks (e.g.
        copy-pasted from a video description) into a list of lists.

        :param str tracks: the multi-line tracks information
        :return: one [track_name, hhmmss] list per non-empty line
        :rtype: list
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed items of every non-empty line of ``tracks``.

        :param str tracks: newline-separated lines of track information
        :raises AttributeError: if a line cannot be parsed; a hint is printed
            before the error is re-raised
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                print(
                    "Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(
                        i + 1, line
                    )
                )
                raise

    # CONVERT durations to timestamps (segmentation start points)
    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform a newline-separated string of album tracks with their
        durations (hhmmss format) into each track's starting timestamp.

        The first timestamp is always '0:00'; each following one is the
        previous timestamp plus the previous track's duration.

        :param str tracks_row_strings: the multi-line tracks information
        :return: the list of each track's starting timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # The last track's duration does not start any further track.
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string timestamp to its equivalent
        duration in seconds as an integer."""
        return sum(
            60 ** i * int(x) for i, x in enumerate(reversed(timestamp.split(':')))
        )

    @staticmethod
    def hhmmss_format(seconds):
        """Transform an integer duration in seconds to its equivalent
        hh:mm:ss formatted string representation."""
        return time.strftime('%H:%M:%S', time.gmtime(seconds))
|
249
|
|
|
|