CoherenceFilesBuilder.create_files()   C
last analyzed

Complexity

Conditions 11

Size

Total Lines 25
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 11
eloc 21
nop 5
dl 0
loc 25
rs 5.4
c 0
b 0
f 0

How to fix   Complexity   

Complexity

Complex methods like patm.build_coherence.CoherenceFilesBuilder.create_files() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its enclosing class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
#!/usr/bin/env python
2
3
import os
4
import re
5
import in_place
6
import subprocess
7
from glob import glob
8
9
import logging
10
11
logger = logging.getLogger(__name__)
12
13
14
class CoherenceFilesBuilder:
    """Build co-occurrence (cooc) and positive-PMI (ppmi) dictionary files for a dataset collection.

    Expects ``collection_root`` to contain a ``vocab*.txt`` vocabulary file and one
    or more ``vowpal*.txt`` bag-of-words files: either a single unnamed split
    (``vowpal.<col_name>.txt``) or an explicit 'train'/'test' pair
    (``vowpal.<col_name>-train.txt`` / ``vowpal.<col_name>-test.txt``).
    """

    def __init__(self, collection_root):
        """
        :param str collection_root: directory holding the collection's vocab/vowpal files
        :raises VocabularyNotFoundError: if no file matches 'vocab*.txt'
        :raises InvalidSplitsError: if the vowpal files form neither a single
            unnamed split nor exactly a 'train'/'test' pair
        """
        self._root = collection_root
        self._col_name = os.path.basename(self._root)
        try:
            self._vocab = self._glob("vocab*.txt")[0]
        except IndexError:
            raise VocabularyNotFoundError("Glob pattern '{}' did not match any file in '{}'".format("vocab*.txt", self._root))
        if self._col_name not in os.path.basename(self._vocab):
            logger.warning("{} Instead '{}' found.".format("Vocabulary file usually has the format 'vocab.{col_name}.txt.", os.path.basename(self._vocab)))

        # Pair each vowpal file with its split label ('' = no splitting).
        # NOTE(review): re.search returns None for a non-conforming filename,
        # which would raise AttributeError here — assumes every vowpal*.txt
        # follows the 'vowpal.<col_name>[-<split>].txt' convention; confirm.
        self._splits = sorted(
            [(re.search(r"vowpal\.{}-?([\w\-]*)\.txt".format(self._col_name), f).group(1), f)
             for f in self._glob("vowpal*.txt")],
            key=lambda x: x[0], reverse=True)
        labels = [label for label, _ in self._splits]
        if labels != [''] and labels != ['train', 'test']:
            # BUGFIX: original message had an unbalanced closing parenthesis.
            raise InvalidSplitsError("Either 'train' and 'test' splits must be defined or a '' split (no splitting; all dataset used for training)")

    def _glob(self, pattern):
        """Return the paths under the collection root matching *pattern*."""
        return glob("{}/{}".format(self._root, pattern))

    def _path(self, *args, **kwargs):
        """Build a file path under the root: non-empty *args* joined with '_',
        plus an optional dotted ``extension`` keyword argument."""
        extension = kwargs.get('extension', '')
        stem = '_'.join(str(a) for a in args if a != '')
        return os.path.join(self._root, stem + ('.' + extension if extension else ''))

    def _split_paths(self, split, min_tf, min_df):
        """Return the four output file paths (cooc/ppmi x tf/df) for *split*."""
        return {'cooc_tf': self._path('cooc', min_tf, 'tf', split, extension='txt'),
                'cooc_df': self._path('cooc', min_df, 'df', split, extension='txt'),
                'ppmi_tf': self._path('ppmi', min_tf, 'tf', split, extension='txt'),
                'ppmi_df': self._path('ppmi', min_df, 'df', split, extension='txt')}

    @staticmethod
    def _zero_index_tokens(path):
        """Rewrite *path* in place, decrementing the two leading (1-based) token ids per line."""
        with in_place.InPlace(path, backup_ext='.bak') as fp:
            for line in fp:
                first_id, second_id, rest = re.search(r'^(\d+) (\d+) (.+)$', line).groups()
                fp.write('{} {} {}\n'.format(int(first_id) - 1, int(second_id) - 1, rest))

    def create_files(self, cooc_window=5, min_tf=0, min_df=0, apply_zero_index=True):
        """Create cooc/ppmi dictionary files for every split of the collection.

        :param int cooc_window: number of tokens around a specific token used in co-occurrence counting
        :param int min_tf: minimal co-occurrence count for a pair of tokens to be kept
        :param int min_df: minimal number of documents a pair must co-occur in
        :param bool apply_zero_index: rewrite output files converting 1-based token ids to 0-based
        """
        for split, vowpal in self._splits:
            paths = self._split_paths(split, min_tf, min_df)
            exit_code = self.create_cooc_files(vowpal, self._vocab,
                                               paths['cooc_tf'], paths['cooc_df'],
                                               paths['ppmi_tf'], paths['ppmi_df'],
                                               cooc_window=cooc_window, min_tf=min_tf, min_df=min_df)
            label = split if split else 'all'  # '' split means the whole dataset
            if exit_code == 0:
                print("Created ppmi files for '{}' split.".format(label))
            else:
                print("Something went wrong when creating ppmi files for '{}' split.".format(label))

            if apply_zero_index:
                print("Applying zero indexing of tokens (ids)")
                for path in paths.values():
                    self._zero_index_tokens(path)

    @staticmethod
    def create_cooc_files(vowpal_file, vocab_file, cooc_tf, cooc_df, ppmi_tf, ppmi_df, cooc_window=5, min_tf=0, min_df=0):
        """
        Invoke the 'bigartm' CLI to compute co-occurrence and ppmi dictionaries.

        :param str vowpal_file: path to vowpal-formated bag-of-words file
        :param str vocab_file: path to uci-formated (list of unique tokens) vocabulary file
        :param str cooc_tf: file path to save the terms frequency (tf) dictionary of co-occurrences of every specific pair of tokens: total number of times a pair of tokens appears in the dataset
        :param str cooc_df: file path to save the document-frequency (df) dictionary with the number of documents in which each pair of tokens appeared: in how many documents pair 'x' can be found
        :param str ppmi_tf: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_tf dictionary
        :param str ppmi_df: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_df dictionary
        :param int min_tf: minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences
        :param int min_df: minimal value of documents in which a specific pair of tokens occurred together closely
        :param int cooc_window: number of tokens around specific token, which are used in calculation of cooccurrences
        :return: the subprocess exit code (0 on success)
        :rtype: int
        """
        # let everything flow into the terminal
        args = ['bigartm', '-c', vowpal_file, '-v', vocab_file, '--cooc-window', str(cooc_window),
                '--cooc-min-tf', str(min_tf), '--write-cooc-tf', cooc_tf,
                '--cooc-min-df', str(min_df), '--write-cooc-df', cooc_df,
                '--write-ppmi-tf', ppmi_tf,
                '--write-ppmi-df', ppmi_df,
                '--force']
        print('Executing:', ' '.join(args))
        return subprocess.call(args)
84
85
86
class VocabularyNotFoundError(Exception):
    """Raised when no file matching 'vocab*.txt' exists in the collection root."""


class InvalidSplitsError(Exception):
    """Raised when the vowpal files are neither a single '' split nor a 'train'/'test' pair."""
88
89
90
import click
91
92
@click.command()
@click.argument('collection')
@click.option('--window', '-w', default=5, show_default=True, help="number of tokens around specific token, which are used in calculation of cooccurrences")
@click.option('--cooc_min_tf', '-min_tf', default=0, show_default=True, help="minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences")
@click.option('--cooc_min_df', '-min_df', default=0, show_default=True, help="minimal value of documents in which a specific pair of tokens occurred together closely")
def main(collection, window, cooc_min_tf, cooc_min_df):
    """Build coherence (cooc/ppmi) files for COLLECTION found under the collections root.

    NOTE(review): reads the module-level global 'collections_root', which is only
    assigned inside the __main__ guard below — importing this module and invoking
    main() programmatically would raise NameError; confirm intended usage.
    """
    cfb = CoherenceFilesBuilder(os.path.join(collections_root, collection))
    cfb.create_files(cooc_window=window, min_tf=cooc_min_tf, min_df=cooc_min_df)
100
101
102
if __name__ == '__main__':
    # Hard-coded root directory for dataset collections; main() reads this as a global.
    collections_root = "/data/thesis/data/collections"
    main()