preprocessing.remove_emoji()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 38
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 2.0625

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 38
ccs 3
cts 4
cp 0.75
rs 10
c 0
b 0
f 0
cc 2
nop 1
crap 2.0625
1
"""
2
Preprocessing
3
4
....................................................................................................
5
MIT License
6
Copyright (c) 2021-2023 AUT Iran, Mohammad H Forouhesh
7
Copyright (c) 2021-2022 MetoData.ai, Mohammad H Forouhesh
8
....................................................................................................
9
This module contains various tools for text preprocessing.
10
"""
11
12 1
import re
13
14
15 1
# Compiled once at import time; the pattern never changes between calls.
# NOTE(review): the range U+24C2-U+1F251 is far wider than "emoji" (it also
# spans CJK, Hangul and many other scripts). It is kept unchanged so existing
# outputs do not shift -- confirm whether it should be narrowed.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U000024C2-\U0001F251"  # very broad range, see NOTE above
    "\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF"
    "\U00002700-\U000027BF"
    "\u200d"                 # zero-width joiner used inside emoji sequences
    "\u2060-\u2069"
    # Individual non-emoji characters that are stripped as well
    # (typographic quotes/dashes and accented Latin/Turkic letters).
    '’'
    '£'
    'í'
    'ó'
    'ú'
    'á'
    '–'
    '“”‘‘‘'
    'éàééàéééàéè'
    'üöççəəəəçä'
    'ışşƏıışşƏışêêñ'
    'İğğ~•'
    '⏯'
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text: str) -> str:
    """
    Remove emojis (and a fixed set of extra symbols/accented letters) using regex.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; nothing is
    inserted in its place, so surrounding whitespace is left untouched.

    :param text:    An input text.
    :return:        The text with all matched characters removed, or an empty
                    string when ``text`` is not a ``str`` (e.g. ``None``).
    """
    try:
        return _EMOJI_PATTERN.sub('', text)
    except TypeError:
        # Non-string input: keep the historical best-effort contract of
        # returning '' without hiding unrelated errors or interrupts.
        return ''
53
54
55 1
# Multi-character patterns, applied in this exact order; each match becomes a
# single space. Order matters: '#' is blanked before the URL rule runs, and
# abbreviations must be matched while their parentheses are still present.
_ORDERED_SUBSTITUTIONS = tuple(re.compile(pattern) for pattern in (
    r'@[A-Za-z0-9]+',       # @mentions
    r'_[A-Za-z0-9]+',       # underscore-prefixed latin/digit runs
    r'[\r\n\t]+',           # line breaks and tabs
    r'#',                   # hashtag marker (the tag text itself is kept)
    r'RT[\s]+',             # retweet marker
    r'https?:\/\/\S+',      # hyperlinks
    r'\([ا-ی]{1,3}\)',      # parenthesised 1-3 letter abbreviations
    r'¹²',                  # the literal two-character sequence only
))

# Every single character that is replaced by a space once the ordered
# patterns above have run.
_NOISE_CHARS = re.compile(
    "["
    r"\d"                       # any Unicode decimal digit (ASCII and Persian)
    r"A-Za-z"                   # latin letters
    r"()\[\]{}"                 # brackets of all kinds
    r"&:;|!+*$/%.^,?=@_<>\-"    # ASCII punctuation and symbols
    "\"'"                       # straight quotes
    "—"                         # long dash
    "…"                         # ellipsis
    "»«"                        # guillemets
    "؟"                         # Arabic-script question mark
    "؛"                         # Arabic-script semicolon
    "،"                         # Arabic-script comma
    "]"
)

# Letters a word may consist of; any other character disqualifies the word.
_PERSIAN_LETTERS = frozenset(
    'آ ا ب پ ت ث ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی'.split()
)


def remove_redundant_characters(text: str) -> str:
    """
    A tool to remove redundant and unwanted characters.

    The text is cleaned in three stages: zero-width non-joiners become spaces,
    the ordered multi-character patterns (mentions, links, ...) are blanked,
    and every remaining noise character (digits, latin letters, punctuation)
    is replaced by a space. The result is then split on single spaces and only
    words longer than one character made up solely of the letters in
    ``_PERSIAN_LETTERS`` are kept.

    :param text:    An input text.
    :return:        The surviving words joined by single spaces. An empty string
                    is returned when the cleaned text contains no space at all
                    (i.e. it is one unbroken token).
    """
    text = text.replace('\u200c', ' ')
    for pattern in _ORDERED_SUBSTITUTIONS:
        text = pattern.sub(' ', text)
    text = _NOISE_CHARS.sub(' ', text)

    tokens = text.split(' ')
    # Historical rule kept on purpose: a cleaned text without any space yields
    # nothing. Checked once here rather than once per word.
    if len(tokens) <= 1:
        return ''
    return ' '.join(
        token for token in tokens
        if len(token) > 1 and _PERSIAN_LETTERS.issuperset(token)
    )
92