1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
1 |
|
import os |
4
|
1 |
|
import re |
5
|
1 |
|
import time |
6
|
|
|
|
7
|
|
|
|
8
|
1 |
|
class StringToDictParser(object):
    """Parses album information out of a video title string.

    An instance is configured with named *entities* (name -> regex string,
    each wrapped in an ``AlbumInfoEntity``) and an ordered list of
    *separators* (regex strings). Calling the instance with a title and a
    ``design`` (a list of token lists) builds one ``RegexSequence`` per token
    list and returns the richest dictionary any of them extracts.

    Tokens inside a design are either entity names or separator references
    of the form ``'s<N>'``, where ``N`` is the 1-based index into the
    separators list.
    """

    # Matches a separator reference such as 's1' or 's12' (1-based, no 's0').
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps entity name to its regex string
        :param list separators: regex strings, referenced as 's1', 's2', ...
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError(
                'All separators must be strings; got {!r}'.format(separators))
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse a title according to the given designs.

        The title is the first positional argument; the designs are passed
        as the mandatory ``design`` keyword argument.

        :return: the dictionary with the most extracted fields among all
            designs (the earliest design wins a tie)
        :rtype: dict
        :raises RuntimeError: if a design is malformed
        """
        title = args[0]
        design = kwargs['design']
        self._validate_design(design)
        sequences = [RegexSequence(list(self._yield_reg_comp(d))) for d in design]
        return max((seq.search_n_dict(title) for seq in sequences), key=len)

    def _validate_design(self, design):
        """Raise RuntimeError unless every token list in design is well formed."""
        max_tokens = len(self.entities) + len(self.separators)
        for tokens in design:
            if len(tokens) > max_tokens or not all(isinstance(t, str) for t in tokens):
                raise RuntimeError(
                    'Design {!r} must hold at most {} string tokens'.format(tokens, max_tokens))
            for token in tokens:
                # A token starting with 's' must be a proper separator
                # reference, unless it is the name of a declared entity
                # (e.g. 'song'), which would otherwise be unusable.
                if (token.startswith('s')
                        and not StringToDictParser.check.match(token)
                        and token not in self.entities):
                    raise RuntimeError(
                        "Token {!r} is neither a separator reference 's<N>' "
                        "nor a known entity".format(token))

    def _yield_reg_comp(self, kati):
        """Yield the separator regex or entity object for each token in kati."""
        for token in kati:
            separator_ref = StringToDictParser.check.match(token)
            if separator_ref:
                # Separator references are 1-based: 's1' -> separators[0].
                yield self.separators[int(separator_ref.group(1)) - 1]
            else:
                yield self.entities[token]
34
|
|
|
|
35
|
1 |
|
class AlbumInfoEntity(object):
    """A named regex fragment.

    Stores a ``name`` and a ``reg`` value; converting the object to a string
    yields ``reg``.
    """

    def __init__(self, name, reg):
        """
        :param name: value stored on the ``name`` attribute
        :param reg: value stored on the ``reg`` attribute and returned by ``str()``
        """
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the stored ``reg`` value."""
        pattern = self.reg
        return pattern
42
|
|
|
|
43
|
|
|
|
44
|
1 |
|
class RegexSequence(object):
    """A regex built by concatenating the string forms of its components.

    Components that expose a ``name`` attribute contribute a key; the keys
    are later paired, in order, with the groups of a successful search.
    """

    def __init__(self, data):
        """
        :param data: components; ``str()`` of each is joined into the regex
        """
        self._keys = []
        for component in data:
            if hasattr(component, 'name'):
                self._keys.append(component.name)
        self._regex = ''.join(map(str, data))

    def search_n_dict(self, string):
        """Search string and return the non-empty groups keyed by name.

        :param str string: text to search
        :return: key -> matched text for every group with a truthy value;
            empty when the regex does not match
        :rtype: dict
        """
        match = re.search(self._regex, string)
        if match is None:
            return {}
        extracted = {}
        for key, value in zip(self._keys, match.groups()):
            # Unmatched optional groups (None) and empty matches are skipped.
            if value:
                extracted[key] = value
        return extracted
51
|
|
|
|
52
|
|
|
|
53
|
1 |
|
class StringParser(object):
    """Regex-based parsers for video titles, track file names and track lists.

    ``__new__`` caches the first instance it creates and returns that same
    object on every later call. All parsing entry points are class or static
    methods, so they can be used without creating an instance.
    """

    __instance = None

    # Building blocks for the file-name and track-line patterns. Several of
    # them contain insignificant spaces and are meant to be compiled with
    # re.X (verbose mode).
    regexes = {'track_number': r'\d{1,2}',
               'sep1': r"(?: [\t\ ]* [\.\-\)]+ )? [\t ]*",
               'track_word': r"\(?[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE][\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]*\)?",
               'track_sep': r'[\t\ ,]+',
               'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
               'extension': r'\.mp3',
               'hhmmss': r'(?:\d?\d:)*\d?\d'}
    # A track name is one or more track words joined by track separators.
    # It is registered here, at class-definition time, so the classmethods
    # below work even when no instance has ever been created.
    regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)

    # Patterns used to parse a youtube video title string.
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser({'artist': art, 'album': alb, 'year': year}, [sep1, sep2])

    def __new__(cls, *args, **kwargs):
        """Create the cached instance on first use and return it afterwards."""
        if cls.__instance is None:
            # super(StringParser, cls) is the correct argument order; the
            # reversed form raises TypeError when cls is a subclass.
            cls.__instance = super(StringParser, cls).__new__(cls)
            # Rebuilt on first instantiation, as before, so it reflects the
            # current contents of cls.regexes.
            cls.regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**cls.regexes)
        return cls.__instance

    # STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dictionary of album information.

        The following patterns are tried:

        - Artist Album Year
        - Artist Album
        - Album Year
        - Album

        :param str video_title: the title to parse
        :return: the extracted values, with keys taken from
            {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(video_title, design=[['artist', 's1', 'album', 's2', 'year'],
                                                          ['artist', 's1', 'album'],
                                                          ['album', 's2', 'year'],
                                                          ['album']])

    # PARSE filenames
    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Extract track number and name from a file name.

        Only the base name of ``file_name`` is examined, e.g.
        '1. - Loyal to the Pack.mp3'. The number part of the pattern is
        optional, so 'track_number' is None when it is absent.

        :param str file_name: path or file name ending in '.mp3'
        :return: {'track_number': ..., 'track_name': ...}
        :rtype: dict
        :raises AttributeError: if the file name does not match the pattern
        """
        pattern = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(**cls.regexes), re.X)
        match = pattern.search(os.path.basename(file_name))
        return dict(zip(['track_number', 'track_name'], match.groups()))

    # PARSE tracks info multiline
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Parse a newline-separated track listing into a list of lists.

        Useful for text copy-pasted from a video description. Each inner
        list is [track_name, hhmmss_formatted_time].

        :param str tracks: one track per line; blank lines are ignored
        :return: one [track_name, time] list per non-blank line
        :rtype: list
        :raises AttributeError: if a non-blank line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed [track_name, time] list of every non-blank line.

        :param str tracks: newline-separated lines of track information
        :raises AttributeError: if a line cannot be parsed; a hint naming the
            1-based line number is printed first
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                print("Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(i + 1, line))
                raise

    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a line such as '01. Doteru 3:45' into ['Doteru', '3:45'].

        A leading track number, if present, is matched but not captured.

        :param str track_line: a single line of track information
        :return: [track_name, hhmmss_time]
        :rtype: list
        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(**cls.regexes), re.X)
        return list(regex.search(track_line.strip()).groups())

    # CONVERT durations to timestamps tuples (segmentation start-end pair)
    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Convert a listing of track durations into starting timestamps.

        The input is a newline-separated track listing whose time column
        holds each track's duration. The first timestamp is always '0:00';
        every following one is the previous timestamp plus the previous
        track's duration, formatted by ``hhmmss_format``.

        :param str tracks_row_strings: one 'name - duration' line per track
        :return: the starting timestamp of each track
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        # The last track's duration is not needed: nothing starts after it.
        for parsed_line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], parsed_line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    ###########################
    @staticmethod
    def to_seconds(timestamp):
        """Convert an hh:mm:ss formatted string to its duration in seconds.

        Fields are read right to left, so 'ss', 'mm:ss' and 'hh:mm:ss' are
        all accepted.

        :rtype: int
        """
        return sum(60 ** i * int(x) for i, x in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def hhmmss_format(seconds):
        """Convert a duration in seconds to a zero-padded hh:mm:ss string.

        Hours are not wrapped at 24, so 90000 seconds gives '25:00:00'.

        :param seconds: duration in seconds
        :rtype: str
        """
        if seconds < 0:
            # Negative input is not meaningful here; keep the legacy result.
            return time.strftime('%H:%M:%S', time.gmtime(seconds))
        minutes, secs = divmod(int(seconds), 60)
        hours, minutes = divmod(minutes, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)
182
|
|
|
|