| 
                    1
                 | 
                                    
                             1                          | 
                
                 | 
                import pickle  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    2
                 | 
                                    
                             1                          | 
                
                 | 
                import gensim  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    3
                 | 
                                    
                             1                          | 
                
                 | 
                import numpy as np  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    4
                 | 
                                    
                             1                          | 
                
                 | 
                import pandas as pd  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    5
                 | 
                                    
                             1                          | 
                
                 | 
                from gensim import utils  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    6
                 | 
                                    
                             1                          | 
                
                 | 
                from sklearn.pipeline import Pipeline  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    7
                 | 
                                    
                             1                          | 
                
                 | 
                from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    8
                 | 
                                    
                             1                          | 
                
                 | 
                from .w2v_corpus import W2VCorpus  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                | 
                    11
                 | 
                                    
                             1                          | 
                
                View Code Duplication | 
                class W2VEmb:  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                                                                        
                        
                         
                                                                                        
                                                                                            
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    12
                 | 
                                    
                             1                          | 
                
                 | 
                    def __init__(self, text_document=None):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    13
                 | 
                                    
                             1                          | 
                
                 | 
                        self.wv2_corpus = None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    14
                 | 
                                    
                             1                          | 
                
                 | 
                        self.w2v_model = None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    15
                 | 
                                    
                             1                          | 
                
                 | 
                        self.tf_idf_transformation = None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    16
                 | 
                                    
                             1                          | 
                
                 | 
                        if text_document is not None: self.__init(text_document)  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    18
                 | 
                                    
                             1                          | 
                
                 | 
                    def __init(self, text_document: pd.Series):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                        text_document = text_document.fillna('') | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                        self.tf_idf_transformation = self.tf_idf_transformer(text_document)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                        self.wv2_corpus = W2VCorpus(text_document)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                        self.w2v_model = gensim.models.Word2Vec(sentences=self.wv2_corpus, min_count=1, vector_size=900, epochs=50)  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    24
                 | 
                                    
                             1                          | 
                
                 | 
                    def __getitem__(self, text: str) -> np.ndarray:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    25
                 | 
                                    
                             1                          | 
                
                 | 
                        try:    return self.w2v_model.wv[text]  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                                                                        
                        
                         
                                                                                        
                                                                                            
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    26
                 | 
                                    
                             1                          | 
                
                 | 
                        except: return np.array([0 for _ in range(0, self.w2v_model.vector_size)])  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                                                                        
                        
                         
                                                                                        
                                                                                            
                                                                                     
                     | 
                
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    28
                 | 
                                    
                             1                          | 
                
                 | 
                    def tf_idf_transformer(self, text_series):  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                                                                        
                        
                         
                                                                                        
                                                                                            
                                                                                     
                     | 
                
            
                                                                        
                            
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                        tfidf = Pipeline([('count', CountVectorizer(encoding='utf-8', min_df=3, #max_df=0.9, | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                                                                    max_features=900,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                                                                    ngram_range=(1, 2))),  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                                          ('tfid', TfidfTransformer(sublinear_tf=True, norm='l2'))]).fit(text_series.ravel()) | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                        
                            
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                        return tfidf  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    35
                 | 
                                    
                             1                          | 
                
                 | 
                    def encode(self, text: str) -> np.ndarray:  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    36
                 | 
                                    
                             1                          | 
                
                 | 
                        stream = utils.simple_preprocess(text)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    37
                 | 
                                    
                             1                          | 
                
                 | 
                        tf_idf_vec = self.tf_idf_transformation.transform(stream).toarray()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    38
                 | 
                                    
                             1                          | 
                
                 | 
                        w2v_encode = self[stream]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    39
                 | 
                                    
                             1                          | 
                
                 | 
                        return np.mean(list(self.tf_idf_mean(tf_idf_vec, w2v_encode)), axis=0)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    41
                 | 
                                    
                             1                          | 
                
                 | 
                    def save(self, path: str):  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                        with open(path, 'wb') as f:  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    45
                 | 
                                    
                             1                          | 
                
                 | 
                    def load(self, path: str):  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    46
                 | 
                                    
                             1                          | 
                
                 | 
                        with open(path, 'rb') as f:  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    47
                 | 
                                    
                             1                          | 
                
                 | 
                            self.__dict__.update(pickle.load(f).__dict__)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    49
                 | 
                                    
                             1                          | 
                
                 | 
                    @staticmethod  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    50
                 | 
                                    
                             1                          | 
                
                 | 
                    def tf_idf_mean(tf_idf_vec: np.ndarray, w2v_encode: np.ndarray):  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    51
                 | 
                                    
                             1                          | 
                
                 | 
                        for ind in range(len(tf_idf_vec)):  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                            yield tf_idf_vec[ind]*w2v_encode[ind]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                 |