music_album_creation.tracks_parsing.RegexSequence.search_n_dict() - Code Metrics - boromir674/music-album-creator - Measure and Improve Code Quality continuously with Scrutinizer

RegexSequence.search_n_dict() A
last analyzed 2023-04-15 10:51 UTC

↳ Parent: music_album_creation.tracks_parsing

Complexity

Conditions

Size

Total Lines	14
Code Lines	11

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	6
CRAP Score	2

Importance

Changes

Metric	Value
cc	2
eloc	11
nop	2
dl	0
loc	14
ccs	6
cts	6
cp	1
crap	2
rs	9.85
c	0
b	0
f	0

# -*- coding: utf-8 -*-

import os
import re
import time


class StringToDictParser(object):
    """Parses album information out of video title string.

    The parser is configured with a mapping of entity names to regex
    fragments and a list of separator regex fragments.  Calling it with a
    title and a ``design`` (a list of token sequences) builds one regex per
    sequence and returns the richest dictionary any of them extracts.

    Inside a design sequence a token of the form ``s1``, ``s2``, ... refers
    to the 1-based position in ``separators``; any other token must be the
    name of a configured entity.
    """

    # Matches separator tokens ('s1', 's2', ...) and captures the 1-based index.
    check = re.compile(r'^s([1-9]\d*)$')

    def __init__(self, entities, separators):
        """
        :param dict entities: maps entity name to its regex fragment
        :param list separators: regex fragments (strings) usable as 's<N>' tokens
        :raises RuntimeError: if any separator is not a string
        """
        if not all(isinstance(x, str) for x in separators):
            raise RuntimeError('All separators must be strings')
        self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
        self.separators = separators

    def __call__(self, *args, **kwargs):
        """Parse a title with every sequence of the design and keep the best result.

        :param args: the title string is expected as the first positional argument
        :param kwargs: must contain ``design``, a list of lists of tokens
        :return: the extracted dictionary with the most keys (the first one
            wins on ties); an empty dictionary when the design is empty
        :rtype: dict
        :raises RuntimeError: if the design fails validation
        :raises KeyError: if a token names an entity that is not configured
        :raises IndexError: if a separator token points past ``separators``
        """
        title = args[0]
        design = kwargs['design']
        self._validate_design(design)
        # Build every sequence before searching, so that a bad token fails
        # early, prior to running any regex.
        sequences = [RegexSequence(list(self._yield_reg_comp(d))) for d in design]
        results = [sequence.search_n_dict(title) for sequence in sequences]
        if not results:
            # Nothing to try; max() would raise ValueError on an empty list.
            return {}
        return max(results, key=len)

    def _validate_design(self, design):
        """Raise RuntimeError unless every sequence of the design is well formed."""
        max_length = len(self.entities) + len(self.separators)
        for sequence in design:
            if len(sequence) > max_length or not all(
                isinstance(token, str) for token in sequence
            ):
                raise RuntimeError(
                    'Each design sequence must hold at most {} string tokens'.format(
                        max_length
                    )
                )
        for sequence in design:
            for token in sequence:
                # A token starting with 's' is fine when it is a well formed
                # separator reference or the name of a configured entity.
                if (
                    token.startswith('s')
                    and token not in self.entities
                    and not self.check.match(token)
                ):
                    raise RuntimeError(
                        "Malformed separator token '{}'; expected 's<N>' with N >= 1".format(
                            token
                        )
                    )

    def _yield_reg_comp(self, kati):
        """Yield, per token, either the separator regex or the AlbumInfoEntity.

        :param kati: iterable of design tokens
        """
        for k in kati:
            separator_ref = self.check.match(k)
            if separator_ref:
                # Separator tokens are 1-based, the list is 0-based.
                yield self.separators[int(separator_ref.group(1)) - 1]
            else:
                yield self.entities[k]


class AlbumInfoEntity(object):
    """Pairs an entity name with the regex fragment used to capture it.

    ``str(entity)`` gives back the regex fragment, so entities and plain
    separator strings can be joined together into one pattern.
    """

    def __init__(self, name, reg):
        """
        :param name: the entity name
        :param reg: the regex fragment for this entity
        """
        self.name, self.reg = name, reg

    def __str__(self):
        """Return the regex fragment."""
        return self.reg


class RegexSequence(object):
    """A regex assembled from an ordered sequence of components.

    Components having a ``name`` attribute contribute a key; every component
    contributes ``str(component)`` to the concatenated pattern.
    """

    def __init__(self, data):
        """
        :param data: sequence of components (named entities and/or plain strings)
        """
        names = []
        for component in data:
            if hasattr(component, 'name'):
                names.append(component.name)
        self._keys = names
        self._regex = ''.join(map(str, data))

    def search_n_dict(self, string):
        """Search the string and map the keys to the captured groups.

        Keys are paired positionally with the match groups; pairs whose group
        is empty or did not participate in the match are left out.

        :param str string: the text to search in
        :return: the non-empty captures by key; empty when nothing matches
        :rtype: dict
        """
        found = re.search(self._regex, string)
        if found is None:
            return {}
        return {key: value for key, value in zip(self._keys, found.groups()) if value}


class StringParser(object):
    """Singleton bundling the regex-based parsers for titles, file names and track lists.

    All parsing entry points are class/static methods and work without
    creating an instance; instantiating the class always gives back the
    same object.
    """

    __instance = None

    # The fragments below are compiled with the re.X (VERBOSE) flag, because
    # they contain whitespace on purpose for better readability.
    # Track lines follow the shape:
    #   r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})"
    regexes = {
        'track_number': r'\d{1,2}',  # we know this will try to match as many as possible with back-tracking ;-)
        'sep1': r"(?: [\t\ ]* [\.\-\,)]+ )? [\t ]*",
        'track_word_first_char': r"[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE]",
        'track_word_char': r"[\.\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]",
        'track_sep': r'[\t\ ,]+',
        'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* | [\t\ ]+ )',
        'extension': r'\.mp[34]',
        'hhmmss': r'(?:\d?\d:)*\d?\d',
    }
    # Derived fragments are built right here, at class creation, so that the
    # class methods below can be used without instantiating the class first.
    regexes['track_word'] = r'\(?{track_word_first_char}{track_word_char}*\)?'.format(
        **regexes
    )
    regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(**regexes)

    ## to parse from youtube video title string
    sep1 = r'[\t ]*[\-\.][\t ]*'
    sep2 = r'[\t \-\.]+'
    year = r'\(?(\d{4})\)?'
    art = r'([\w ]*\w)'
    alb = r'([\w ]*\w)'

    album_info_parser = StringToDictParser(
        {'artist': art, 'album': alb, 'year': year}, [sep1, sep2]
    )

    def __new__(cls, *args, **kwargs):
        """Create the single instance on first use and return it ever after."""
        if cls.__instance is None:
            # super(StringParser, cls): the class goes first, so that this
            # also works when invoked through a subclass.
            cls.__instance = super(StringParser, cls).__new__(cls)
        return cls.__instance

    ## STRING TO DICT
    @classmethod
    def parse_album_info(cls, video_title):
        """Parse a video title into a dictionary with any of the 'artist', 'album' and 'year' fields.

        The supported patterns are:
         - Artist Album Year
         - Artist Album
         - Album Year
         - Album

        :param str video_title:
        :return: the extracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
        :rtype: dict
        """
        return cls.album_info_parser(
            video_title,
            design=[
                ['artist', 's1', 'album', 's2', 'year'],
                ['artist', 's1', 'album'],
                ['album', 's2', 'year'],
                ['album'],
            ],
        )

    # Uses the cls.regexes
    # PARSE filenames
    @classmethod
    def parse_track_number_n_name(cls, file_name):
        """Extract the track number and name from a file name like '1. - Loyal to the Pack.mp3'.

        Only the base name of the path is inspected.  The number part of the
        pattern is optional; when it is absent the 'track_number' value is None.

        :param str file_name: file name or path ending in '.mp3' or '.mp4'
        :return: a dict like {'track_number': 'number', 'track_name': 'name'}
        :rtype: dict
        :raises AttributeError: if the file name does not match the pattern
        """
        pattern = re.compile(
            r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(
                **cls.regexes
            ),
            re.X,  # VERBOSE: ignore the whitespace inside the pattern
        )
        found = pattern.search(os.path.basename(file_name))
        return dict(zip(['track_number', 'track_name'], found.groups()))

    # Uses the cls.regexes
    @classmethod
    def _parse_track_line(cls, track_line):
        """Parse a string line such as '01. Doteru 3:45' into ['Doteru', '3:45'].

        A leading track number, if present, is matched but not returned.

        :param str track_line:
        :return: the parsed items, [track name, time]
        :rtype: list
        :raises AttributeError: if the line does not match the pattern
        """
        regex = re.compile(
            r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(
                **cls.regexes
            ),
            re.X,
        )
        return list(regex.search(track_line.strip()).groups())

    # PARSE tracks info multiline
    @classmethod
    def parse_hhmmss_string(cls, tracks):
        """Transform a newline-separated string of album tracks (eg copy-pasted from a video description) to a list of lists.

        :param str tracks:
        :return: one [track_name, hhmmss formatted time] list per non-blank line
        :rtype: list
        :raises AttributeError: if a non-blank line cannot be parsed
        """
        return list(cls._parse_string(tracks))

    @classmethod
    def _parse_string(cls, tracks):
        """Yield the parsed items of every non-blank line.

        :param str tracks: a newline-separated string of lines corresponding to the tracks information
        :raises AttributeError: if a line cannot be parsed; the offending line
            is printed before the error propagates
        """
        for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
            if line == '':
                continue
            try:
                yield cls._parse_track_line(line)
            except AttributeError:
                print(
                    "Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(
                        i + 1, line
                    )
                )
                raise

    # CONVERT durations to timestamps tuples (segmentation start-end pair)
    @classmethod
    def convert_to_timestamps(cls, tracks_row_strings):
        """Transform track durations into the starting timestamp of each track.

        The input is a newline-separated string of album tracks (eg
        copy-pasted from the youtube video description) whose times represent
        durations.  The first timestamp is always the literal '0:00'; each
        following one adds the previous track's duration and is formatted by
        hhmmss_format.  The duration of the last track is not needed.

        :param str tracks_row_strings:
        :return: the list of each track's timestamp
        :rtype: list
        """
        lines = cls.parse_hhmmss_string(tracks_row_strings)  # list of lists
        timestamps = ['0:00']
        for line in lines[:-1]:
            timestamps.append(cls.add(timestamps[-1], line[-1]))
        return timestamps

    @classmethod
    def add(cls, timestamp1, duration):
        """Add a duration to a timestamp.

        :param str timestamp1: hh:mm:ss
        :param str duration: hh:mm:ss
        :return: hh:mm:ss
        :rtype: str
        """
        return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))

    ###########################
    @staticmethod
    def to_seconds(timestamp):
        """Transform a hh:mm:ss formatted string timestamp to its equivalent duration in seconds as an integer."""
        return sum(60**i * int(x) for i, x in enumerate(reversed(timestamp.split(':'))))

    @staticmethod
    def hhmmss_format(seconds):
        """Transform a time duration in seconds to its equivalent hh:mm:ss formatted string.

        The hours field is not wrapped, so durations of 24 hours or more keep
        growing (eg 90000 -> '25:00:00').
        """
        # Plain arithmetic instead of time.gmtime, which treats the value as a
        # point in time and therefore starts over at 00 after 24 hours.
        hours, remainder = divmod(int(seconds), 3600)
        minutes, secs = divmod(remainder, 60)
        return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)


1		# -- coding: utf-8 --
2
3	1	import os
4	1	import re
5	1	import time
6
7
8	1	class StringToDictParser(object):
9		"""Parses album information out of video title string"""
10	1
11		check = re.compile(r'^s([1-9]\d*)$')
12	1
13	1	def __init__(self, entities, separators):
14		if not all(type(x) == str for x in separators):
15	1	raise RuntimeError
16	1	self.entities = {k: AlbumInfoEntity(k, v) for k, v in entities.items()}
17		self.separators = separators
18	1
19	1	def __call__(self, args, *kwargs):
20	1	title = args[0]
21	1	design = kwargs['design']
22		if not all(
23	1	0 <= len(x) <= len(self.entities) + len(self.separators)
24		and all(type(y) == str for y in x)
25	1	for x in design
26	1	):
27		raise RuntimeError
28	1	if not all(
29	1	all(StringToDictParser.check.match(y) for y in x if y.startswith('s'))
30	1	for x in design
31	1	):
32		raise RuntimeError
33	1	rregs = [RegexSequence([_ for _ in self._yield_reg_comp(d)]) for d in design]
34		return max([r.search_n_dict(title) for r in rregs], key=lambda x: len(x))
35	1
36	1	def _yield_reg_comp(self, kati):
37	1	for k in kati:
38	1	if k.startswith('s'):
39		yield self.separators[int(StringToDictParser.check.match(k).group(1)) - 1]
40	1	else:
41	1	yield self.entities[k]
42
43
44	1	class AlbumInfoEntity(object):
45	1	def __init__(self, name, reg):
46	1	self.name = name
47	1	self.reg = reg
48
49	1	def __str__(self):
50	1	return self.reg
51
52
53	1	class RegexSequence(object):
54	1	def __init__(self, data):
55	1	self._keys = [d.name for d in data if hasattr(d, 'name')]
56		self._regex = r'{}'.format(''.join(str(d) for d in data))
57
58		def search_n_dict(self, string):
59		return dict(
60		_
61		for _ in zip(
62		self._keys,
63		list(
64	1	getattr(
65	1	re.search(self._regex, string),
66	1	'groups',
67	1	lambda: len(self._keys) * [''],
68	1	)()
69		),
70	1	)
71		if _[1]
72	1	)
73	1
74	1
75	1	class StringParser(object):
76	1	__instance = None
77		# we take care of compiling the below regexes with the re.X flag
78		# because they contain whitespaces on purpose for better readability
79	1	# VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
80
81		# r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})"
82		regexes = {
83		'track_number': r'\d{1,2}', # we know this will try to match as many as possible with back-tracking ;-)
84		'sep1': r"(?: [\t\ ]* [\.\-\,)]+ )? [\t ]*",
85		'track_word_first_char': r"[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE]",
86		'track_word_char': r"[\.\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]",
87		# 'track_word': r"\(?[\wα-ωΑ-Ω'\x86-\xce\u0384-\u03CE][\w\-’':!\xc3\xa8α-ωΑ\-Ω\x86-\xce\u0384-\u03CE]*\)?",
88		'track_sep': r'[\t\ ,]+',
89		'sep2': r'(?: [\t\ ]* [\-.]+ [\t\ ]* \| [\t\ ]+ )',
90		'extension': r'\.mp[34]',
91	1	'hhmmss': r'(?:\d?\d:)*\d?\d',
92		}
93
94		## to parse from youtube video title string
95		sep1 = r'[\t ][\-\.][\t ]'
96	1	sep2 = r'[\t \-\.]+'
97		year = r'\(?(\d{4})\)?'
98		art = r'([\w ]*\w)'
99	1	alb = r'([\w ]*\w)'
100
101		album_info_parser = StringToDictParser(
102		{'artist': art, 'album': alb, 'year': year}, [sep1, sep2]
103		)
104
105	1	def __new__(cls, args, *kwargs):
106		if not cls.__instance:
107		cls.__instance = super(cls, StringParser).__new__(cls)
108		cls.regexes[
109		'track_word'
110		] = r'\(?{track_word_first_char}{track_word_char}\)?'.format(*cls.regexes)
111	1	cls.regexes['track_name'] = r'{track_word}(?:{track_sep}{track_word})*'.format(
112		**cls.regexes
113	1	)
114		return cls.__instance
115
116		## STRING TO DICT
117		@classmethod
118		def parse_album_info(cls, video_title):
119		"""Call to parse a video title string into a hash (dictionary) of potentially all 'artist', 'album' and 'year' fields.\n
120	1	Can parse patters:
121	1	- Artist Album Year\n
122	1	- Artist Album\n
123	1	- Album Year\n
124	1	- Album\n
125		:param str video_title:
126		:return: the exracted values as a dictionary having maximally keys: {'artist', 'album', 'year'}
127		:rtype: dict
128		"""
129	1	return cls.album_info_parser(
130		video_title,
131		design=[
132		['artist', 's1', 'album', 's2', 'year'],
133		['artist', 's1', 'album'],
134		['album', 's2', 'year'],
135		['album'],
136		],
137		)
138
139		# Uses the cls.regexes
140		# PARSE filenames
141		@classmethod
142	1	def parse_track_number_n_name(cls, file_name):
143	1	"""Call this method to get a dict like {'track_number': 'number', 'track_name': 'name'} from input file name with format like '1. - Loyal to the Pack.mp3'; number must be included!"""
144		return dict(
145		zip(
146	1	['track_number', 'track_name'],
147		list(
148		re.compile(
149		r"(?: ({track_number}) {sep1})? ( {track_name} ) {extension}$".format(
150		**cls.regexes
151		),
152		re.X, # VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
153		)
154	1	.search(os.path.basename(file_name))
155	1	.groups()
156	1	),
157	1	)
158	1	)
159	1	# return dict(zip(['track_number', 'track_name'], list(re.compile(r'({}){}({}){}$'.format(cls.track_number, cls.sep2, cls.track_name, cls.extension)).search(file_name).groups())))
160	1
161		# Uses the cls.regexes
162	1	@classmethod
163		def _parse_track_line(cls, track_line):
164		"""
165		Parses a string line such as '01. Doteru 3:45' into ['Doteru', '3:45']\n
166		:param track_line:
167		:return: the parsed items
168		:rtype: list
169		"""
170	1	# regex = re.compile(r"""^(?:\d{1,2}(?:[\ \t][\.\-,][\ \t]\|[\t\ ]+))? # potential track number (eg 01) included is ignored
171		# ([\w\'\(\) \-’]*[\w)]) # track name
172		# (?:[\t ]+\|[\t ][\-\.]+[\t ]) # separator between name and time
173	1	# ((?:\d?\d:)*\d?\d)$ # time in hh:mm:ss format""", re.X)
174		# regex = re.compile(r"^(?:{}{})?({}){}({})$".format(cls.track_number, cls.number_name_sep, cls.track_name, cls.sep, cls.hhmmss))
175		regex = re.compile(
176	1	r"(?: {track_number} {sep1})? ( {track_name} ) {sep2} ({hhmmss})".format(
177		**cls.regexes
178	1	),
179		re.X,
180		)
181	1	return list(regex.search(track_line.strip()).groups())
182
183		# PARSE tracks info multiline
184		@classmethod
185		def parse_hhmmss_string(cls, tracks):
186		"""Call this method to transform a '\n' separabale string of album tracks (eg copy-pasted from video description) to a list of lists. Inner lists contains [track_name, hhmmss_formated time].\n
187		:param str tracks:
188		:return:
189		"""
190		return [_ for _ in cls._parse_string(tracks)]
191
192		@classmethod
193		def _parse_string(cls, tracks):
194		"""
195		:param str tracks: a '\n' separable string of lines coresponding to the tracks information
196		:return:
197		"""
198		# regex = re.compile('(?:\d{1,2}[ \t][\.\-,][ \t]\|[\t ]+)?([\w\'\(\) ][\w)])' + cls.sep + '((?:\d?\d:)\d?\d)$')
199		for i, line in enumerate(_.strip() for _ in tracks.split('\n')):
200		if line == '':
201		continue
202		try:
203		yield cls._parse_track_line(line)
204		except AttributeError as e:
205		print(
206		"Couldn't parse line {}: '{}'. Please use a format as 'trackname - 3:45'".format(
207		i + 1, line
208		)
209		)
210		raise e
211
212		# CONVERT durations to timestamps tuples (segmentation start-end pair)
213		@classmethod
214		def convert_to_timestamps(cls, tracks_row_strings):
215		"""Call this method to transform a '\n' separabale string of album tracks (eg copy-pasted from the youtube video description) that represents durations (in hhmmss format)
216		to a list of strings with each track's starting timestamp in hhmmss format.\n
217		:param str tracks_row_strings:
218		:return: the list of each track's timestamp
219		:rtype: list
220		"""
221		lines = cls.parse_hhmmss_string(tracks_row_strings) # list of lists
222		i = 1
223		timestamps = ['0:00']
224		while i < len(lines):
225		timestamps.append(cls.add(timestamps[i - 1], lines[i - 1][-1]))
226		i += 1
227		return timestamps
228
229		@classmethod
230		def add(cls, timestamp1, duration):
231		"""
232		:param str timestamp1: hh:mm:ss
233		:param str duration: hh:mm:ss
234		:return: hh:mm:ss
235		:rtype: str
236		"""
237		return cls.hhmmss_format(cls.to_seconds(timestamp1) + cls.to_seconds(duration))
238
239		###########################
240		@staticmethod
241		def to_seconds(timestamp):
242		"""Call this method to transform a hh:mm:ss formatted string timestamp to its equivalent duration in seconds as an integer"""
243		return sum([60*i int(x) for i, x in enumerate(reversed(timestamp.split(':')))])
244
245		@staticmethod
246		def hhmmss_format(seconds):
247		"""Call this method to transform an integer representing time duration in seconds to its equivalent hh:mm:ss formatted string representeation"""
248		return time.strftime('%H:%M:%S', time.gmtime(seconds))
249

boromir674 / music-album-creator

RegexSequence.search_n_dict() A last analyzed 2023-04-15 10:51 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

RegexSequence.search_n_dict() A
last analyzed 2023-04-15 10:51 UTC