import os
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from ..api import get_resources
from ..preprocess.preprocessing import remove_redundant_characters, remove_emoji
from ..word2vec.w2v_emb import W2VEmb


class MetaClf:
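    """Text classifier that wraps a user-supplied classifier behind word2vec embeddings.

    Input texts are encoded with W2VEmb and min-max scaled before reaching the
    wrapped classifier, which is expected to expose fit/score/predict as well as
    save_model/load_model.
    """
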
    def __init__(self, classifier_instance, text_array: list = None, embedding_doc: list = None,
                 labels: list = None, load_path: str = None):
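        """Build from raw texts and labels, or restore a saved bundle when load_path is given."""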
        if not isinstance(text_array, pd.Series):
            text_array = pd.Series(text_array)

        self.clf = classifier_instance
        self.emb = W2VEmb()
        self.scaler = None
        # Three directory levels above this file; passed to get_resources below.
        self.dir_path = os.path.dirname(
            os.path.dirname(
                os.path.dirname(
                    os.path.realpath(__file__)))) + "/"
        if load_path is not None:
            # Load a previously saved classifier, embedding, and scaler.
            get_resources(self.dir_path, resource_name=load_path)
            self.load_model(load_path)
        else:
            # Training mode: raw texts and labels are required.
            assert text_array is not None and labels is not None
            text_array.fillna('', inplace=True)
            self.emb = W2VEmb(embedding_doc)

            # Embed every text, then fit the scaler used for all later inputs.
            encoded = list(map(self.emb.encode, tqdm(text_array)))
            self.labels = list(labels)
            self.scaler = self.prep_scaler(encoded)
            self.encoded_input = self.scaler.transform(encoded)

    def prep_scaler(self, encoded):
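        """Fit a MinMaxScaler on the encoded training vectors."""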
        scaler = MinMaxScaler()
        scaler.fit(encoded)
        return scaler

    def fit(self):
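        """Train the wrapped classifier on a stratified 80/20 split and print classification reports."""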
        X_train, X_test, y_train, y_test = train_test_split(self.encoded_input, self.labels, test_size=0.2,
                                                            random_state=42, stratify=self.labels)
        self.clf.fit(X_train, y_train)
        print('score: ', self.clf.score(X_test, y_test))
        print('============================train============================')
        print(classification_report(y_train, self.clf.predict(X_train)))
        print('=============================test============================')
        print(classification_report(y_test, self.clf.predict(X_test)))
        return self.clf

    def load_model(self, load_path: str):
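        """Restore the classifier, embedding, and scaler from model_dir/{load_path}/."""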
        loading_prep = lambda string: f'model_dir/{load_path}/{string}'
        self.clf.load_model(loading_prep('model.json'))
        self.emb.load(loading_prep('emb.pkl'))
        with open(loading_prep('scaler.pkl'), 'rb') as f:
            self.scaler = pickle.load(f)

    def save_model(self, save_path: str):
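        """Persist the classifier, embedding, and scaler under model_dir/{save_path}/."""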
        os.makedirs(f'model_dir/{save_path}', exist_ok=True)
        saving_prep = lambda string: f'model_dir/{save_path}/{string}'
        self.clf.save_model(saving_prep('model.json'))
        self.emb.save(saving_prep('emb.pkl'))
        with open(saving_prep('scaler.pkl'), 'wb') as f:
            pickle.dump(self.scaler, f, pickle.HIGHEST_PROTOCOL)

    def __getitem__(self, item: str) -> int:
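        """Shorthand: clf[text] is equivalent to clf.predict(text)."""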
        return self.predict(item)

    def predict(self, input_text: str) -> int:
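        """Clean, embed, and scale a single text, then return the predicted label."""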
        prep_text = remove_redundant_characters(remove_emoji(input_text))
        vector = self.scaler.transform(self.emb.encode(prep_text).reshape(1, -1))
        return self.clf.predict(vector)[0]
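

# Illustrative usage sketch (not part of the module): any classifier exposing the
# fit/predict/score and save_model/load_model methods used above should work here;
# xgboost.XGBClassifier is one such example.
#
#   from xgboost import XGBClassifier
#   clf = MetaClf(XGBClassifier(), text_array=texts, embedding_doc=texts, labels=labels)
#   clf.fit()
#   clf.save_model('my_model')
#   clf.predict('some new text')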