| 1 |  |  | #!/usr/bin/env python | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import os | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import re | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import in_place | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import subprocess | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | from glob import glob | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | import logging | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | logger = logging.getLogger(__name__) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 14 |  |  | class CoherenceFilesBuilder: | 
            
                                                                        
                            
            
                                    
            
            
                | 15 |  |  |     def __init__(self, collection_root): | 
            
                                                                        
                            
            
                                    
            
            
                | 16 |  |  |         self._root = collection_root | 
            
                                                                        
                            
            
                                    
            
            
                | 17 |  |  |         self._col_name = os.path.basename(self._root) | 
            
                                                                        
                            
            
                                    
            
            
                | 18 |  |  |         try: | 
            
                                                                        
                            
            
                                    
            
            
                | 19 |  |  |             self._vocab = self._glob("vocab*.txt")[0] | 
            
                                                                        
                            
            
                                    
            
            
                | 20 |  |  |         except IndexError: | 
            
                                                                        
                            
            
                                    
            
            
                | 21 |  |  |             raise VocabularyNotFoundError("Glob pattern '{}' did not match any file in '{}'".format("vocab*.txt", self._root)) | 
            
                                                                        
                            
            
                                    
            
            
                | 22 |  |  |         if self._col_name not in os.path.basename(self._vocab): | 
            
                                                                        
                            
            
                                    
            
            
                | 23 |  |  |             logger.warning("{} Instead '{}' found.".format("Vocabulary file usually has the format 'vocab.{col_name}.txt.", os.path.basename(self._vocab))) | 
            
                                                                        
                            
            
                                    
            
            
                | 24 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 25 |  |  |         self._splits = sorted([(re.search(r"vowpal\.{}-?([\w\-]*)\.txt".format(self._col_name), f).group(1), f) for f in self._glob("vowpal*.txt")], key=lambda x: x[0], reverse=True) | 
            
                                                                        
                            
            
                                    
            
            
                | 26 |  |  |         if [_[0] for _ in self._splits] != [''] and [_[0] for _ in self._splits] != ['train', 'test']: | 
            
                                                                        
                            
            
                                    
            
            
                | 27 |  |  |             raise InvalidSplitsError("Either 'train' and 'test' splits must be defined or a '' split no splitting; all dataset used for training)") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |     def _glob(self, pattern): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |         return glob("{}/{}".format(self._root, pattern)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |     def _path(self, *args, **kwargs): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |         return os.path.join(self._root, '_'.join(map(str, [_ for _ in args if _ != ''])) + (lambda x: '.'+x if x else '')(kwargs.get('extension', ''))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |     def create_files(self, cooc_window=5, min_tf=0, min_df=0, apply_zero_index=True): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |         _file = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |         for s, vowpal in self._splits: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |             _file.update({s: {'cooc_tf': self._path('cooc', min_tf, 'tf', s, extension='txt'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |                               'cooc_df': self._path('cooc', min_df, 'df', s, extension='txt'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |                               'ppmi_tf': self._path('ppmi', min_tf, 'tf', s, extension='txt'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |                               'ppmi_df': self._path('ppmi', min_df, 'df', s, extension='txt')} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |                           }) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |             _ = self.create_cooc_files(vowpal, self._vocab, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |                                        _file[s]['cooc_tf'], _file[s]['cooc_df'], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |                                        _file[s]['ppmi_tf'], _file[s]['ppmi_df'], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |                                        cooc_window=cooc_window, min_tf=min_tf, min_df=min_df) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |             if _ == 0: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |                 print("Created ppmi files for '{}' split.".format((lambda x: 'all' if not x else x)(s))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |             else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |                 print("Something went wrong when creating ppmi files for '{}' split.".format((lambda x: 'all' if not x else x)(s))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |             if apply_zero_index: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |                 print("Applying zero indexing of tokens (ids)") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |                 for key, path in _file[s].items(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |                     with in_place.InPlace(path, backup_ext='.bak') as fp: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |                         for line in fp: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |                             match_obj = re.search(r'^(\d+) (\d+) (.+)$', line).groups() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |                             fp.write('{} {} {}\n'.format(int(match_obj[0])-1, int(match_obj[1])-1 , match_obj[2])) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |     @staticmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |     def create_cooc_files(vowpal_file, vocab_file, cooc_tf, cooc_df, ppmi_tf, ppmi_df, cooc_window=5, min_tf=0, min_df=0): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |         :param str vowpal_file: path to vowpal-formated bag-of-words file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |         :param str vocab_file: path to uci-formated (list of unique tokens) vocabulary file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |         :param str cooc_tf: file path to save the terms frequency (tf) dictionary of co-occurrences of every specific pair of tokens: total number of times a pair of tokens appears in the dataset | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |         :param str cooc_df: file path to save the document-frequency (df) dictionary with the number of documents in which each pair of tokens appeared: in how many documents pair 'x' can be found | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |         :param str ppmi_tf: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_tf dictionary | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |         :param str ppmi_df: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_df dictionary | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |         :param int min_tf: minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |         :param int min_df: minimal value of documents in which a specific pair of tokens occurred together closely | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |         :param int cooc_window: number of tokens around specific token, which are used in calculation of cooccurrences | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |         :return: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |         # let everything flow into the terminal | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |         args = ['bigartm', '-c', vowpal_file, '-v', vocab_file, '--cooc-window', str(cooc_window), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |                 '--cooc-min-tf', str(min_tf), '--write-cooc-tf', cooc_tf, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |                 '--cooc-min-df', str(min_df), '--write-cooc-df', cooc_df, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |                 '--write-ppmi-tf', ppmi_tf, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |                 '--write-ppmi-df', ppmi_df, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |                 '--force'] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |         print('Executing:', ' '.join(args)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |         return subprocess.call(args) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  | class VocabularyNotFoundError(Exception): pass | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  | class InvalidSplitsError(Exception): pass | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  | import click | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  | @click.command() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  | @click.argument('collection') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  | @click.option('--window', '-w', default=5, show_default=True, help="number of tokens around specific token, which are used in calculation of cooccurrences") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  | @click.option('--cooc_min_tf', '-min_tf', default=0, show_default=True, help="minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  | @click.option('--cooc_min_df', '-min_df', default=0, show_default=True, help="minimal value of documents in which a specific pair of tokens occurred together closely") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  | def main(collection, window, cooc_min_tf, cooc_min_df): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |     cfb = CoherenceFilesBuilder(os.path.join(collections_root, collection)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |     cfb.create_files(cooc_window=window, min_tf=cooc_min_tf, min_df=cooc_min_df) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  | if __name__ == '__main__': | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 103 |  |  |     collections_root = "/data/thesis/data/collections" | 
            
                                                        
            
                                    
            
            
                | 104 |  |  |     main() |