1
|
1 |
|
import pickle |
|
|
|
|
2
|
1 |
|
import gensim |
3
|
1 |
|
import numpy as np |
4
|
1 |
|
import pandas as pd |
5
|
1 |
|
from gensim import utils |
6
|
1 |
|
from sklearn.pipeline import Pipeline |
7
|
1 |
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
8
|
1 |
|
from .w2v_corpus import W2VCorpus |
9
|
|
|
|
10
|
|
|
|
11
|
1 |
View Code Duplication |
class W2VEmb:
    """Text embedder combining a Word2Vec model with TF-IDF weights.

    A gensim ``Word2Vec`` model and a scikit-learn count/TF-IDF pipeline are
    fitted on the same text series. :meth:`encode` multiplies the TF-IDF row
    of every token with that token's word vector and averages the products.

    All three attributes stay ``None`` until the instance is fitted (by
    passing ``text_document`` to the constructor) or restored via
    :meth:`load`.
    """

    def __init__(self, text_document=None):
        """Create the embedder and, when ``text_document`` is given, fit it.

        Args:
            text_document: optional ``pd.Series`` of raw texts to fit on.
        """
        # The attribute really is spelled ``wv2_corpus``; the name is kept
        # because it is part of the pickled state written by ``save``.
        self.wv2_corpus = None
        self.w2v_model = None
        self.tf_idf_transformation = None
        if text_document is not None:
            self.__init(text_document)

    def __init(self, text_document: pd.Series):
        """Fit the TF-IDF pipeline and the Word2Vec model on ``text_document``.

        Missing values are replaced by empty strings before fitting.
        """
        text_document = text_document.fillna('')
        self.tf_idf_transformation = self.tf_idf_transformer(text_document)
        self.wv2_corpus = W2VCorpus(text_document)
        # NOTE(review): vector_size=900 equals max_features=900 used in
        # tf_idf_transformer; encode() multiplies both element-wise, so the
        # two widths presumably have to stay in sync -- confirm before tuning.
        self.w2v_model = gensim.models.Word2Vec(
            sentences=self.wv2_corpus,
            min_count=1,
            vector_size=900,
            epochs=50,
        )

    def __getitem__(self, text: str) -> np.ndarray:
        """Return the word vector(s) stored in the model for ``text``.

        ``text`` is forwarded unchanged to ``self.w2v_model.wv[...]``;
        :meth:`encode` passes a list of tokens here.

        Returns:
            The looked-up vector(s), or a 1-D integer zero vector of length
            ``vector_size`` when the lookup fails with ``KeyError`` or
            ``ValueError``.

        Raises:
            AttributeError: if no model has been fitted or loaded yet.
        """
        try:
            return self.w2v_model.wv[text]
        except (KeyError, ValueError):
            # KeyError: key not present in the vocabulary.
            # ValueError: kept in the fallback for lookups of an empty key
            # list (TODO confirm against the installed gensim version).
            # Any other exception is a real bug and is no longer swallowed.
            return np.zeros(self.w2v_model.vector_size, dtype=int)

    def tf_idf_transformer(self, text_series):
        """Fit and return a count -> TF-IDF pipeline on ``text_series``.

        Args:
            text_series: texts to fit on; anything ``np.asarray`` accepts
                (``pd.Series``, ``np.ndarray``, list of strings).

        Returns:
            The fitted ``Pipeline`` with the steps ``'count'`` and ``'tfid'``.
        """
        pipeline = Pipeline([
            ('count', CountVectorizer(encoding='utf-8',
                                      min_df=3,
                                      max_features=900,
                                      ngram_range=(1, 2))),
            ('tfid', TfidfTransformer(sublinear_tf=True, norm='l2')),
        ])
        # np.asarray(...).ravel() replaces Series.ravel(), which is deprecated
        # in recent pandas releases, and also accepts plain sequences.
        return pipeline.fit(np.asarray(text_series).ravel())

    def encode(self, text: str) -> np.ndarray:
        """Encode ``text`` as the mean of its TF-IDF weighted word vectors.

        The text is tokenised with ``utils.simple_preprocess``; every token is
        passed to the TF-IDF pipeline as a separate document, so the TF-IDF
        matrix has one row per token.

        Returns:
            The averaged vector, or a zero vector of length ``vector_size``
            when the text produces no tokens.
        """
        stream = utils.simple_preprocess(text)
        if not stream:
            # Nothing to weight or average; return a neutral embedding rather
            # than pushing an empty batch through the pipeline.
            return np.zeros(self.w2v_model.vector_size)
        tf_idf_vec = self.tf_idf_transformation.transform(stream).toarray()
        w2v_encode = self[stream]
        return np.mean(list(self.tf_idf_mean(tf_idf_vec, w2v_encode)), axis=0)

    def save(self, path: str):
        """Pickle the whole embedder to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, path: str):
        """Replace this instance's state with the embedder pickled at ``path``.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        deserialising. Only load files that come from a trusted source.
        """
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f).__dict__)

    @staticmethod
    def tf_idf_mean(tf_idf_vec: np.ndarray, w2v_encode: np.ndarray):
        """Yield ``tf_idf_vec[i] * w2v_encode[i]`` for every row ``i``.

        Iterates over the rows of ``tf_idf_vec``; ``w2v_encode`` is indexed
        with the same position, so it must provide at least as many entries.
        """
        for ind, weights in enumerate(tf_idf_vec):
            yield weights * w2v_encode[ind]
53
|
|
|
|