preprocessing.remove_redundant_characters() - Code Metrics - MohammadForouhesh/tracking-policy-agendas - Measure and Improve Code Quality continuously with Scrutinizer

preprocessing.remove_redundant_characters() A
last analyzed 2023-01-07 20:54 UTC

↳ Parent: preprocessing

Complexity

Conditions

Size

Total Lines	37
Code Lines	31

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	28
CRAP Score	1

Importance

Changes

Metric	Value
eloc	31
dl	0
loc	37
ccs	28
cts	28
cp	1
rs	9.1359
c	0
b	0
f	0
cc	1
nop	1
crap	1

"""
Preprocessing

....................................................................................................
MIT License
Copyright (c) 2021-2023 AUT Iran, Mohammad H Forouhesh
Copyright (c) 2021-2022 MetoData.ai, Mohammad H Forouhesh
....................................................................................................
This module contains various tools for text preprocessing.
"""

import re


# Compiled once at import time instead of on every call.
# The character class is a set, so the repeated ranges/characters below are
# harmless; they are kept exactly as they were to preserve behaviour.
# NOTE(review): the range \U000024C2-\U0001F251 spans every code point between
# those two bounds, so far more than emojis is stripped (anything from U+24C2
# up to U+1F251). Left unchanged on purpose -- confirm whether that is intended.
_EMOJI_PATTERN = re.compile(pattern="["
                                    u"\U0001F600-\U0001F64F"  # emoticons
                                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                    u"\U00002702-\U000027B0"
                                    u"\U000024C2-\U0001F251"
                                    u"\U0001F300-\U0001F5FF"
                                    u"\U0001F1E6-\U0001F1FF"
                                    u"\U00002700-\U000027BF"
                                    u"\U0001F900-\U0001F9FF"
                                    u"\U0001F600-\U0001F64F"
                                    u"\U0001F680-\U0001F6FF"
                                    u"\U00002600-\U000026FF"
                                    u'\u200d'
                                    u'’'
                                    u'£'
                                    u'\u2060-\u2069'
                                    u'í'
                                    u'ó'
                                    u'ú'
                                    u'á'
                                    u'–'
                                    u'“”‘‘‘'
                                    u'éàééàéééàéè'
                                    u'üöççəəəəçä'
                                    u'ışşƏıışşƏışêêñ'
                                    u'İğğ~•'
                                    u'⏯'
                                    "]+", flags=re.UNICODE)


def remove_emoji(text: str) -> str:
    """
    Remove emojis (and the other characters listed in the pattern) using regex.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; nothing is
    inserted in its place.

    :param text:    An input text.
    :return:        The text with the matched characters removed, or an empty
                    string when ``text`` is not a string (e.g. ``None``).
    """
    try:
        return _EMOJI_PATTERN.sub('', text)
    except TypeError:
        # ``re`` raises TypeError for non-string input; keep the original
        # best-effort contract of returning '' instead of propagating, but do
        # not swallow unrelated exceptions the way a bare ``except`` did.
        return ''


# Characters a surviving token may consist of. The spaces inside the literal
# are irrelevant: tokens are produced by splitting on ' ' and never contain one.
_PERSIAN_LETTERS = frozenset('آ ا ب پ ت ث ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی')

# Everything that is blanked out with a single space after the multi-character
# clean-ups (mentions, links, ...) have run. One pass replaces the former chain
# of single-character substitutions.
_NOISE_PATTERN = re.compile(
    r"¹²"                                      # the literal two-character sequence
    r"|[\dA-Za-z]"                             # any Unicode digit (0-9, ۰-۹, ...) and Latin letters
    r"|[()&:\"'_—\-@={};\[\]|!+*$/><,%.^…?]"   # ASCII / general punctuation
    r"|[؟؛،»«]"                                # Persian punctuation and guillemets
)


def remove_redundant_characters(text: str) -> str:
    """
    A tool to remove redundant and unwanted characters.

    Processing order:
      1. zero-width non-joiners become spaces;
      2. @mentions, underscore-prefixed Latin tokens, line breaks/tabs, '#',
         'RT' markers, http(s) links and 1-3 letter parenthesised abbreviations
         are replaced by a space;
      3. digits, Latin letters and punctuation are replaced by a space;
      4. the text is split on single spaces and only tokens longer than one
         character that consist solely of the Persian letters in
         ``_PERSIAN_LETTERS`` are kept.

    If the cleaned text contains no space at all (a single token), an empty
    string is returned.

    :param text:    An input text.
    :return:        A text with removed unwanted characters (punctuations, latin, etc.)
    """
    text = text.replace('\u200c', ' ')
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # Remove @mentions
    text = re.sub(r'_[A-Za-z0-9]+', ' ', text)  # Remove underscore-prefixed Latin tokens
    # Line breaks and tabs. The old pattern carried literal '/' delimiters, so
    # tabs survived and made the whole token fail the letter check below.
    text = re.sub(r'[\r\n\t]+', ' ', text)
    # '#' must go before link removal so that text after a '#' is not
    # swallowed by the \S+ of the link pattern.
    text = text.replace('#', ' ')
    text = re.sub(r'RT\s+', ' ', text)  # Remove the retweet marker
    text = re.sub(r'https?://\S+', ' ', text)  # Remove hyperlinks
    text = re.sub(r'\([ا-ی]{1,3}\)', ' ', text)  # Remove abbreviations
    text = _NOISE_PATTERN.sub(' ', text)

    tokens = text.split(' ')
    if len(tokens) <= 1:
        # Single-token texts yield nothing (behaviour of the original filter).
        return ''
    return ' '.join(word for word in tokens
                    if len(word) > 1 and all(char in _PERSIAN_LETTERS for char in word))


1		"""
2		Preprocessing
3
4		....................................................................................................
5		MIT License
6		Copyright (c) 2021-2023 AUT Iran, Mohammad H Forouhesh
7		Copyright (c) 2021-2022 MetoData.ai, Mohammad H Forouhesh
8		....................................................................................................
9		This module contains various tools for text preprocessing.
10		"""
11
12	1	import re
13
14
15	1	def remove_emoji(text: str) -> str:
16		"""
17		A function to remove emojis using regex
18		:param text: An input text.
19		:return: A text with removed emojis
20		"""
21	1	emoji_pattern = re.compile(pattern="["
22		u"\U0001F600-\U0001F64F" # emoticons
23		u"\U0001F300-\U0001F5FF" # symbols & pictographs
24		u"\U0001F680-\U0001F6FF" # transport & map symbols
25		u"\U0001F1E0-\U0001F1FF" # flags (iOS)
26		u"\U00002702-\U000027B0"
27		u"\U000024C2-\U0001F251"
28		u"\U0001F300-\U0001F5FF"
29		u"\U0001F1E6-\U0001F1FF"
30		u"\U00002700-\U000027BF"
31		u"\U0001F900-\U0001F9FF"
32		u"\U0001F600-\U0001F64F"
33		u"\U0001F680-\U0001F6FF"
34		u"\U00002600-\U000026FF"
35		u'\u200d'
36		u'’'
37		u'£'
38		u'\u2060-\u2069'
39		u'í'
40		u'ó'
41		u'ú'
42		u'á'
43		u'–'
44		u'“”‘‘‘'
45		u'éàééàéééàéè'
46		u'üöççəəəəçä'
47		u'ışşƏıışşƏışêêñ'
48		u'İğğ~•'
49		u'⏯'
50		"]+", flags=re.UNICODE)
51	1	try: return str(emoji_pattern.sub(r'', text))
52		except: return ''
53
54
55	1	def remove_redundant_characters(text: str) -> str:
56		"""
57		A tool to remove redundant and unwanted characters
58		:param text: An input text.
59		:return: A text with removed unwanted characters (punctuations, latin, etc.)
60		"""
61	1	text = text.replace('\u200c', ' ')
62	1	text = re.sub(r'@[A-Za-z0-9]+', ' ', text) # Removed @mentions
63	1	text = re.sub(r'_[A-Za-z0-9]+', ' ', text) # Removed underlines
64	1	text = re.sub(r'/(\r\n)+\|\r+\|\n+\|\t+/', ' ', text) # Removed \n
65	1	text = re.sub(r'#', ' ', text) # Removing the '#' symbol
66	1	text = re.sub(r'RT[\s]+', ' ', text) # Removing RT
67	1	text = re.sub(r'https?:\/\/\S+', ' ', text) # Remove the hyper link
68	1	text = re.sub(r'\([ا-ی]{1,3}\)', ' ', text) # Remove abbreviations
69	1	text = re.sub(r"[\(\)]", " ", text) # remove parantesis
70	1	text = re.sub(r"\d\|[۰-۹]", " ", text)
71	1	text = re.sub(r"&\|:", " ", text)
72	1	text = re.sub(r"[A-Za-z]", " ", text)
73	1	text = re.sub(r"[0-9]", " ", text)
74	1	text = re.sub(r"\"", " ", text)
75	1	text = re.sub(r"\'", " ", text)
76	1	text = re.sub(r"_", " ", text)
77	1	text = re.sub(r"—", " ", text)
78	1	text = re.sub(r"-", " ", text)
79	1	text = re.sub(r"@\|=", " ", text)
80	1	text = re.sub(r"^\d+\s\|\s\d+\s\|\s\d+$", " ", text)
81	1	text = re.sub(r"{\|}\|;\|\[\|\]\|\\|\|؟\|!\|\+\|\-\|\*\|\$", " ", text)
82	1	text = re.sub(r"¹²\|\/", " ", text)
83	1	text = re.sub(r"»\|>\|<\|«\|,\|؛\|،\|%\|؟", " ", text)
84	1	text = re.sub("\.\|\^\|,", " ", text)
85	1	text = text.replace('…', ' ')
86	1	text = text.replace('?', ' ')
87		# text = ' '.join(list(map(lambda word: '' if len(word) < 3 else word, text.split())))
88	1	return ' '.join([word for word in text.split(' ') if len(word) > 1
89		and False not in [char in 'آ ا ب پ ت ث ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی'
90		for char in word]
91		and len(text.split(' ')) > 1])
92

MohammadForouhesh / tracking-policy-agendas

preprocessing.remove_redundant_characters() A last analyzed 2023-01-07 20:54 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

preprocessing.remove_redundant_characters() A
last analyzed 2023-01-07 20:54 UTC