# -*- coding: utf-8 -*-
2
|
|
|
|
3
|
1 |
|
import os |
4
|
1 |
|
import re |
5
|
1 |
|
import time |
6
|
|
|
|
7
|
|
|
|
8
|
1 |
|
class StringToDictParser(object):
    """Parse named fields (e.g. artist, album, year) out of a title string.

    The parser is configured with named entity regexes and a list of separator
    regexes. When called, it receives one or more "designs": lists of tokens
    that are either entity names or separator references ('s1', 's2', ...).
    Each design is turned into one regex; the design that extracts the most
    fields from the title wins.
    """

    # A design token of the form 's<N>' (N >= 1, no leading zero) refers to the
    # N-th separator, 1-based. Any other token starting with 's' is rejected.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps an entity name to the regex string matching it
        :param list separators: regex strings, referenced in designs as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError('All separators must be strings')
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Extract fields from a title using the best matching design.

        The title is the first positional argument; the designs are passed as
        the ``design`` keyword argument (a list of token lists).

        :return: the dictionary, among those produced by each design, that has
            the most keys (the first such design wins on ties)
        :rtype: dict
        :raises RuntimeError: if a design is malformed (see _validate_design)
        """
        title = args[0]
        design = kwargs['design']
        self._validate_design(design)
        sequences = [RegexSequence(list(self._yield_reg_comp(d))) for d in design]
        return max([s.search_n_dict(title) for s in sequences], key=len)

    def _validate_design(self, design):
        """Raise RuntimeError unless every pattern in `design` is usable.

        A pattern is rejected when it is longer than the number of known
        entities plus separators, contains a non-string token, contains a token
        starting with 's' that is not a valid in-range separator reference, or
        names an entity this parser was not configured with.
        """
        max_length = len(self.entities) + len(self.separators)
        for pattern in design:
            if len(pattern) > max_length:
                raise RuntimeError(
                    'Design pattern {!r} has more than {} components'.format(pattern, max_length)
                )
            # Type check first so the string methods below are safe to call.
            if not all(isinstance(token, str) for token in pattern):
                raise RuntimeError(
                    'Design pattern {!r} must contain only strings'.format(pattern)
                )
            for token in pattern:
                if token.startswith('s'):
                    match = StringToDictParser.check.match(token)
                    if not match:
                        raise RuntimeError(
                            "Invalid separator reference {!r}; expected 's1', 's2', ...".format(token)
                        )
                    # Fail here rather than with an IndexError while building the regex.
                    if int(match.group(1)) > len(self.separators):
                        raise RuntimeError(
                            'Separator reference {!r} is out of range; only {} separators defined'.format(
                                token, len(self.separators)
                            )
                        )
                elif token not in self.entities:
                    # Fail here rather than with a KeyError while building the regex.
                    raise RuntimeError('Unknown entity {!r} in design'.format(token))

    def _yield_reg_comp(self, kati):
        """Yield, for each design token, the separator regex or entity it refers to."""
        for k in kati:
            if k.startswith('s'):
                # 's<N>' is 1-based; the separators list is 0-based.
                yield self.separators[int(StringToDictParser.check.match(k).group(1)) - 1]
            else:
                yield self.entities[k]
42
|
|
|
|
43
|
|
|
|
44
|
1 |
|
class AlbumInfoEntity(object):
    """A named field paired with the regex used to match it."""

    def __init__(self, name, reg):
        """
        :param name: the field's name, stored as `self.name`
        :param reg: the regex for the field, stored as `self.reg`
        """
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the stored regex, so str(entity) yields the pattern itself."""
        return self.reg
51
|
|
|
|
52
|
|
|
|
53
|
1 |
|
class RegexSequence(object):
    """A sequence of regex fragments searched as one concatenated pattern.

    Fragments exposing a ``name`` attribute contribute a result key, in order;
    those keys are paired positionally with the capture groups of the
    concatenated regex.
    """

    def __init__(self, data):
        """
        :param data: ordered iterable of fragments; each is converted with str()
            and concatenated. Items having a ``name`` attribute define the keys.
        """
        # Materialise once: `data` is traversed twice below, which would
        # silently yield an empty regex if a generator were passed in.
        data = list(data)
        self._keys = [d.name for d in data if hasattr(d, 'name')]
        self._regex = ''.join(str(d) for d in data)

    def search_n_dict(self, string):
        """Search `string` and map each key to its captured group.

        :param str string: the text to search
        :return: key -> captured text; keys whose group is empty or did not
            participate in the match are omitted. Empty dict if nothing matched.
        :rtype: dict
        """
        match = re.search(self._regex, string)
        if match is None:
            return {}
        return {key: value for key, value in zip(self._keys, match.groups()) if value}
73
|
1 |
|
|
74
|
1 |
|
|
75
|
1 |
|
class StringParser(object):
    """Singleton holding regex-based parsers for album and track strings.

    Every parsing entry point is a classmethod or staticmethod and all regex
    fragments are fully built when the class is created, so the parsers work
    whether or not the class has ever been instantiated.
    """

    __instance = None

    # The fragments below contain whitespace on purpose, for readability.
    # Every regex assembled from them must therefore be compiled with the
    # re.X (re.VERBOSE) flag, which ignores whitespace outside character classes.
    #
    # Shape of a track line: r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})"
    regexes = {
        'track_number': r'\d{1,2}',  # we know this will try to match as many as possible with back-tracking ;-)
        'sep1': r"(?: [\t\ ]* [\.\-\,)]+ )? [\t ]*",
        'track_word_first_char': r"[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE]",
        'track_word_char': r"[\.\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]",
        'track_sep': r'[\t\ ,]+',
        'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
        'extension': r'\.mp[34]',
        'hhmmss': r'(?:\d?\d:)*\d?\d',
    }
    # Derived fragments. Built here, at class-creation time, rather than on
    # first instantiation, so the classmethods never hit a missing key.
    regexes['track_word'] = r'\(?{track_word_first_char}{track_word_char}*\)?'.format(**regexes)
    regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)

    ## to parse from youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser(
        {'artist': art, 'album': alb, 'year': year}, [sep1, sep2]
    )

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first use."""
        if not cls.__instance:
            # super(StringParser, cls): the first argument is the class whose
            # parent is wanted. The reversed form raised TypeError for subclasses.
            cls.__instance = super(StringParser, cls).__new__(cls)
        return cls.__instance

    ## STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dictionary of 'artist', 'album' and 'year' fields.

        Can parse the patterns:
         - Artist Album Year
         - Artist Album
         - Album Year
         - Album

        :param str video_title:
        :return: the extracted values as a dictionary having at most the keys
            {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(
            video_title,
            design=[
                ['artist', 's1', 'album', 's2', 'year'],
                ['artist', 's1', 'album'],
                ['album', 's2', 'year'],
                ['album'],
            ],
        )

    # Uses the cls.regexes
    # PARSE filenames
    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Extract the track number and name from a file name such as '01. Loyal to the Pack.mp3'.

        Only the base name of `file_name` is examined. The leading number is
        optional in the pattern; when it is not matched, 'track_number' is None.

        :param str file_name: path or file name ending in '.mp3' or '.mp4'
        :return: a dict like {'track_number': 'number', 'track_name': 'name'}
        :rtype: dict
        :raises AttributeError: if the file name does not match the pattern
        """
        pattern = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(
                **cls.regexes
            ),
            re.X,  # the fragments contain insignificant whitespace
        )
        # `search` returns None on no match, hence the AttributeError above.
        groups = pattern.search(os.path.basename(file_name)).groups()
        return dict(zip(['track_number', 'track_name'], groups))

    # Uses the cls.regexes
    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a line such as '01. Doteru 3:45' into ['Doteru', '3:45'].

        A leading track number, if present, is matched but not captured.

        :param str track_line: one line of track information
        :return: the parsed items, [track_name, hhmmss]
        :rtype: list
        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(
                **cls.regexes
            ),
            re.X,
        )
        # `search` returns None on no match; _parse_string relies on the
        # resulting AttributeError to report unparsable lines.
        return list(regex.search(track_line.strip()).groups())

    # PARSE tracks info multiline
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks into a list of lists.

        The input is typically copy-pasted from a video description. Each inner
        list contains [track_name, hhmmss_formatted_time]. Blank lines are skipped.

        :param str tracks: newline-separated track lines
        :return: one [track_name, hhmmss] list per non-blank line
        :rtype: list
        :raises AttributeError: if a non-blank line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed [track_name, hhmmss] items of each non-blank line.

        :param str tracks: newline-separated lines corresponding to the tracks information
        :raises AttributeError: re-raised, after printing a hint, for a line
            that cannot be parsed
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                print(
                    "Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(
                        i + 1, line
                    )
                )
                raise

    # CONVERT durations to timestamps tuples (segmentation start-end pair)
    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Convert per-track durations into per-track starting timestamps.

        The input is a newline-separated string of album tracks (e.g.
        copy-pasted from the youtube video description) whose times are
        durations in hh:mm:ss format. The first timestamp is always '0:00';
        each following one is the previous timestamp plus the previous track's
        duration, formatted by `hhmmss_format`.

        :param str tracks_row_strings:
        :return: the list of each track's starting timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # The last track's duration is not needed: nothing starts after it.
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    ###########################
    @staticmethod
    def to_seconds(timestamp):
        """Convert an hh:mm:ss formatted string to its duration in seconds as an integer.

        Shorter forms such as 'mm:ss' or 'ss' are accepted as well: fields are
        weighted from the right by increasing powers of 60.
        """
        return sum(60**i * int(x) for i, x in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def hhmmss_format(seconds):
        """Convert a duration in seconds to its zero-padded hh:mm:ss string representation.

        The hours field is not wrapped at 24, so long durations round-trip
        through `to_seconds`.
        """
        minutes, secs = divmod(int(seconds), 60)
        hours, minutes = divmod(minutes, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)
249
|
|
|
|