1
|
|
|
""" |
2
|
|
|
Meta Class for Classifiers |
3
|
|
|
|
4
|
|
|
.................................................................................................... |
5
|
|
|
MIT License |
6
|
|
|
Copyright (c) 2021-2023 AUT Iran, Mohammad H Forouhesh |
7
|
|
|
Copyright (c) 2021-2022 MetoData.ai, Mohammad H Forouhesh |
8
|
|
|
.................................................................................................... |
9
|
|
|
This module abstracts classifiers. |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
import os |
13
|
1 |
|
import pickle |
14
|
1 |
|
from typing import List, Union |
15
|
|
|
|
16
|
1 |
|
import numpy as np |
17
|
1 |
|
import pandas as pd |
18
|
1 |
|
from tqdm import tqdm |
19
|
1 |
|
from sklearn.preprocessing import MinMaxScaler |
20
|
1 |
|
from sklearn.metrics import classification_report |
21
|
1 |
|
from sklearn.model_selection import train_test_split |
22
|
|
|
|
23
|
1 |
|
from ..api import get_resources |
24
|
1 |
|
from ..preprocess.preprocessing import remove_redundant_characters, remove_emoji |
25
|
1 |
|
from ..word2vec.w2v_emb import W2VEmb |
26
|
|
|
|
27
|
|
|
|
28
|
1 |
View Code Duplication |
class MetaClf:
    """Meta class that wraps a classifier behind a Word2Vec embedding + Min-Max scaling pipeline.

    On construction it either loads a previously saved model bundle from disk
    (``load_path``) or encodes the provided training texts so that a later call
    to :meth:`fit` can train the wrapped classifier.
    """

    def __init__(self, classifier_instance, text_array: Union[List[str], pd.Series] = None, embedding_doc: list = None,
                 labels: list = None, load_path: str = None):
        """
        :param classifier_instance: A classifier exposing ``fit``/``predict``/``score`` and
               ``load_model``/``save_model`` (e.g. an XGBoost estimator — assumed from the
               ``model.json`` file name; TODO confirm against callers).
        :param text_array: Training texts; required when ``load_path`` is None.
        :param embedding_doc: Corpus used to build the Word2Vec embedding.
        :param labels: Training labels; required when ``load_path`` is None.
        :param load_path: Name of a saved model bundle to fetch and load instead of training.
        :raises ValueError: If neither a load path nor complete training data is supplied.
        """
        # Validate BEFORE coercing to a Series: pd.Series(None) yields an empty
        # Series, which would silently defeat a later None-check. An explicit
        # raise also survives `python -O`, unlike the previous `assert`.
        if load_path is None and (text_array is None or labels is None):
            raise ValueError('text_array and labels are required when load_path is not given')

        if not isinstance(text_array, pd.Series):
            text_array = pd.Series(text_array)

        self.clf = classifier_instance
        self.emb = W2VEmb()
        self.scaler = None
        # Package root: three directory levels above this file, with a trailing slash.
        self.dir_path = os.path.dirname(
            os.path.dirname(
                os.path.dirname(
                    os.path.realpath(__file__)))) + "/"
        if load_path is not None:
            get_resources(self.dir_path, resource_name=load_path)
            self.load_model(load_path)
        else:
            # Non-mutating fillna: do not modify a Series owned by the caller.
            text_array = text_array.fillna('')
            self.emb = W2VEmb(embedding_doc)

            encoded = list(map(self.emb.encode, tqdm(text_array)))
            self.labels = list(labels)
            self.scaler = self.prep_scaler(encoded)
            self.encoded_input = self.scaler.transform(encoded)

    def prep_scaler(self, encoded: List[np.ndarray]) -> MinMaxScaler:
        """
        Fitting a Min-Max Scaler to use in the pipeline
        :param encoded: An array of numbers.
        :return: A MinMaxScaler
        """
        scaler = MinMaxScaler()
        scaler.fit(encoded)
        return scaler

    def fit(self):
        """
        Train the wrapped classifier on a stratified 80/20 split of the encoded
        inputs and print score plus per-split classification reports.
        :return: The fitted classifier instance.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.encoded_input, self.labels, test_size=0.2,
                                                            random_state=42, stratify=self.labels)
        self.clf.fit(X_train, y_train)
        print('score: ', self.clf.score(X_test, y_test))
        # Fixed banner typo: 'trian' -> 'train'.
        print('============================train============================')
        print(classification_report(y_train, self.clf.predict(X_train)))
        print('=============================test============================')
        print(classification_report(y_test, self.clf.predict(X_test)))
        return self.clf

    def load_model(self, load_path: str) -> None:
        """
        A tool to load model from disk.
        :param load_path: Model path.
        :return: None
        """
        # Local def instead of a name-bound lambda (PEP 8 E731).
        def loading_prep(name: str) -> str:
            return f'model_dir/{load_path}/{name}'

        self.clf.load_model(loading_prep('model.json'))
        self.emb.load(loading_prep('emb.pkl'))
        # NOTE(review): pickle is only safe on trusted, locally produced files.
        with open(loading_prep('scaler.pkl'), 'rb') as f:
            self.scaler = pickle.load(f)

    def save_model(self, save_path: str) -> None:
        """
        A tool to save model to disk
        :param save_path: Saving path.
        :return: None.
        """
        os.makedirs(f'model_dir/{save_path}', exist_ok=True)

        # Local def instead of a name-bound lambda (PEP 8 E731).
        def saving_prep(name: str) -> str:
            return f'model_dir/{save_path}/{name}'

        self.clf.save_model(saving_prep('model.json'))
        self.emb.save(saving_prep('emb.pkl'))
        with open(saving_prep('scaler.pkl'), 'wb') as f:
            pickle.dump(self.scaler, f, pickle.HIGHEST_PROTOCOL)

    def __getitem__(self, item: str) -> int:
        """
        getitem overwritten
        :param item: Input text
        :return: Predicted class (0, 1).
        """
        return self.predict(item)

    def predict(self, input_text: str) -> int:
        """
        Prediction method.
        :param input_text: input text, string
        :return: predicted class. (0, 1)
        """
        prep_text = remove_redundant_characters(remove_emoji(input_text))
        vector = self.scaler.transform(self.emb.encode(prep_text).reshape(1, -1))
        return self.clf.predict(vector)[0]