Total Complexity | 13 |
Total Lines | 53 |
Duplicated Lines | 79.25 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
1 | import pickle |
||
|
|||
2 | import gensim |
||
3 | import numpy as np |
||
4 | import pandas as pd |
||
5 | from gensim import utils |
||
6 | from sklearn.pipeline import Pipeline |
||
7 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
||
8 | from .w2v_corpus import W2VCorpus |
||
9 | |||
10 | |||
class W2VEmb:
    """Text encoder combining a Word2Vec model with a TF-IDF pipeline.

    When ``text_document`` is given, the constructor fits both models on it;
    otherwise the instance stays empty until ``load`` restores a saved state.

    Attributes are set to ``None`` until the models are fitted or loaded:
    ``wv2_corpus`` (the ``W2VCorpus`` wrapper used for training),
    ``w2v_model`` (the gensim ``Word2Vec`` model) and
    ``tf_idf_transformation`` (the fitted scikit-learn ``Pipeline``).
    """

    def __init__(self, text_document=None):
        """Create the encoder and, if ``text_document`` is given, fit it."""
        self.wv2_corpus = None
        self.w2v_model = None
        self.tf_idf_transformation = None
        if text_document is not None:
            self.__init(text_document)

    def __init(self, text_document: pd.Series):
        """Fit the TF-IDF pipeline and the Word2Vec model on ``text_document``.

        Missing values are replaced by empty strings before fitting.
        """
        text_document = text_document.fillna('')
        self.tf_idf_transformation = self.tf_idf_transformer(text_document)
        self.wv2_corpus = W2VCorpus(text_document)
        self.w2v_model = gensim.models.Word2Vec(sentences=self.wv2_corpus, min_count=1, vector_size=900, epochs=50)

    def _vector_or_zeros(self, token: str) -> np.ndarray:
        """Return the embedding of one token, or a zero vector if it is unknown."""
        try:
            return self.w2v_model.wv[token]
        except KeyError:
            # Out-of-vocabulary token: fall back to a neutral all-zero vector
            # instead of failing the whole lookup.
            return np.zeros(self.w2v_model.vector_size)

    def __getitem__(self, text: "str | list[str]") -> np.ndarray:
        """Look up Word2Vec embeddings.

        Args:
            text: A single token, or a sequence of tokens.

        Returns:
            For a single token, a 1-D vector of length ``vector_size``.
            For a sequence, a 2-D array with one row per token, in order.
            Tokens missing from the vocabulary map to zero vectors; an empty
            sequence yields an array of shape ``(0, vector_size)``.
        """
        if isinstance(text, str):
            return self._vector_or_zeros(text)
        # Resolve tokens one by one so that a single unknown token only zeroes
        # its own row rather than the embedding of the entire sequence.
        rows = [self._vector_or_zeros(token) for token in text]
        if not rows:
            return np.zeros((0, self.w2v_model.vector_size))
        return np.vstack(rows)

    def tf_idf_transformer(self, text_series):
        """Fit and return a CountVectorizer -> TfidfTransformer pipeline.

        Args:
            text_series: The documents to fit on (a pandas Series or any
                array-like of strings).

        Returns:
            The fitted ``Pipeline``.
        """
        # NOTE(review): ``encode`` multiplies TF-IDF rows with Word2Vec rows
        # element-wise, so the number of TF-IDF features (at most
        # ``max_features``) has to broadcast against the Word2Vec vector size.
        count_step = CountVectorizer(encoding='utf-8', min_df=3,
                                     max_features=900,
                                     ngram_range=(1, 2))
        tfidf_step = TfidfTransformer(sublinear_tf=True, norm='l2')
        tfidf = Pipeline([('count', count_step), ('tfid', tfidf_step)])
        # ``np.asarray(...).ravel()`` replaces ``Series.ravel()``, which recent
        # pandas versions deprecate, and also accepts plain arrays and lists.
        return tfidf.fit(np.asarray(text_series).ravel())

    def encode(self, text: str) -> np.ndarray:
        """Encode ``text`` as the mean of its TF-IDF-weighted token embeddings.

        The text is tokenised with gensim's ``simple_preprocess``; every token
        is passed to the TF-IDF pipeline as its own document and multiplied
        element-wise with its Word2Vec vector; the products are averaged.

        Returns:
            The averaged vector, or a zero vector of length ``vector_size``
            when the text produces no tokens.
        """
        stream = utils.simple_preprocess(text)
        if not stream:
            # Nothing to average; avoid the NaN that np.mean([]) would produce.
            return np.zeros(self.w2v_model.vector_size)
        tf_idf_vec = self.tf_idf_transformation.transform(stream).toarray()
        w2v_encode = self[stream]
        return np.mean(list(self.tf_idf_mean(tf_idf_vec, w2v_encode)), axis=0)

    def save(self, path: str):
        """Pickle this instance to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, path: str):
        """Restore the state pickled at ``path`` into this instance.

        Warning:
            Unpickling can execute arbitrary code. Only load files that come
            from a trusted source.
        """
        with open(path, 'rb') as f:
            # SECURITY: pickle.load must never be used on untrusted input.
            self.__dict__.update(pickle.load(f).__dict__)

    @staticmethod
    def tf_idf_mean(tf_idf_vec: np.ndarray, w2v_encode: np.ndarray):
        """Yield ``tf_idf_vec[i] * w2v_encode[i]`` for every row of ``tf_idf_vec``."""
        for ind, weights in enumerate(tf_idf_vec):
            yield weights * w2v_encode[ind]
||
53 |