| Total Complexity | 13 |
| Total Lines | 53 |
| Duplicated Lines | 79.25 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | import pickle |
||
|
|
|||
| 2 | import gensim |
||
| 3 | import numpy as np |
||
| 4 | import pandas as pd |
||
| 5 | from gensim import utils |
||
| 6 | from sklearn.pipeline import Pipeline |
||
| 7 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
||
| 8 | from .w2v_corpus import W2VCorpus |
||
| 9 | |||
| 10 | |||
class W2VEmb:
    """Text encoder combining a Word2Vec model with a tf-idf pipeline.

    When constructed with a text series, a tf-idf pipeline and a Word2Vec
    model are fitted on it. ``encode`` then tokenises a text, multiplies
    each token's tf-idf row element-wise with its word vector and averages
    the products.
    """

    # The element-wise product in ``tf_idf_mean`` requires tf-idf rows and
    # word vectors of equal length, so both models share this one size.
    # NOTE(review): ``max_features`` is only an upper bound; a corpus with
    # fewer distinct features yields shorter tf-idf rows and the product in
    # ``encode`` will not broadcast -- confirm corpora are large enough.
    VECTOR_SIZE = 900
    EPOCHS = 50
    MIN_COUNT = 1

    def __init__(self, text_document=None):
        """Create the encoder and fit it if ``text_document`` is given.

        Args:
            text_document: Optional pandas Series of raw texts. When it is
                ``None`` the instance stays unfitted (all models ``None``),
                e.g. so that ``load`` can populate it afterwards.
        """
        self.wv2_corpus = None
        self.w2v_model = None
        self.tf_idf_transformation = None
        if text_document is not None:
            self.__init(text_document)

    def __init(self, text_document: pd.Series):
        """Fit the tf-idf pipeline and the Word2Vec model on the texts."""
        # Missing values become empty strings before fitting.
        text_document = text_document.fillna('')
        self.tf_idf_transformation = self.tf_idf_transformer(text_document)
        self.wv2_corpus = W2VCorpus(text_document)
        self.w2v_model = gensim.models.Word2Vec(
            sentences=self.wv2_corpus,
            min_count=self.MIN_COUNT,
            vector_size=self.VECTOR_SIZE,
            epochs=self.EPOCHS,
        )

    def __getitem__(self, text: "str | list[str]") -> np.ndarray:
        """Return the word vector(s) for a token or a sequence of tokens.

        Args:
            text: A single key, or an iterable of keys (as passed by
                ``encode``).

        Returns:
            For a single key, its vector, or a zero vector of length
            ``vector_size`` when the key is unknown. For an iterable, a
            2-D array with one row per key, where unknown keys contribute
            a zero row.
        """
        vector_size = self.w2v_model.vector_size
        if isinstance(text, (str, int, np.integer)):
            try:
                return self.w2v_model.wv[text]
            except (KeyError, IndexError):
                # Unknown key: fall back to a neutral zero vector.
                return np.zeros(vector_size, dtype=np.float32)
        # Resolve keys one by one so that a single unknown token does not
        # discard the vectors of every other token in the sequence.
        rows = [self[token] for token in text]
        return np.array(rows, dtype=np.float32).reshape(len(rows), vector_size)

    def tf_idf_transformer(self, text_series):
        """Fit and return a count-vectoriser + tf-idf pipeline.

        Args:
            text_series: The texts to fit on (e.g. a pandas Series); it is
                flattened to a 1-D array first.

        Returns:
            The fitted ``Pipeline`` with steps ``'count'`` and ``'tfid'``.
        """
        # ``Series.ravel`` is deprecated in recent pandas; flatten via NumPy.
        documents = np.asarray(text_series).ravel()
        counter = CountVectorizer(
            encoding='utf-8',
            min_df=3,
            max_features=self.VECTOR_SIZE,
            ngram_range=(1, 2),
        )
        weighting = TfidfTransformer(sublinear_tf=True, norm='l2')
        return Pipeline([('count', counter), ('tfid', weighting)]).fit(documents)

    def encode(self, text: str) -> np.ndarray:
        """Encode ``text`` as the mean of its tf-idf weighted word vectors.

        Args:
            text: The raw text to encode.

        Returns:
            A 1-D array; all zeros (length ``vector_size``) when the text
            yields no tokens.
        """
        stream = utils.simple_preprocess(text)
        if not stream:
            # Nothing to average: without this guard the empty token list
            # would fail downstream or produce NaN from the mean.
            return np.zeros(self.w2v_model.vector_size)
        # Every token is transformed as its own one-token document.
        tf_idf_vec = self.tf_idf_transformation.transform(stream).toarray()
        w2v_encode = self[stream]
        return np.mean(list(self.tf_idf_mean(tf_idf_vec, w2v_encode)), axis=0)

    def save(self, path: str):
        """Pickle this instance to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, path: str):
        """Replace this instance's attributes with those pickled at ``path``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        files that come from a trusted source.
        """
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f).__dict__)

    @staticmethod
    def tf_idf_mean(tf_idf_vec: np.ndarray, w2v_encode: np.ndarray):
        """Yield the element-wise product of each pair of corresponding rows.

        Args:
            tf_idf_vec: Rows of tf-idf weights.
            w2v_encode: Rows of word vectors, paired with ``tf_idf_vec`` by
                position.
        """
        for weights, vector in zip(tf_idf_vec, w2v_encode):
            yield weights * vector
||
| 53 |