CoherenceFilesBuilder.create_cooc_files()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 23
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 10
nop 9
dl 0
loc 23
rs 9.9
c 0
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent as the method evolves to need more, or different, data.

There are several approaches to avoid long parameter lists:

1
#!/usr/bin/env python
2
3
import os
4
import re
5
import in_place
6
import subprocess
7
from glob import glob
8
9
import logging
10
11
logger = logging.getLogger(__name__)
12
13
14
class CoherenceFilesBuilder:
    """Builds co-occurrence (cooc) and positive-PMI (ppmi) dictionary files
    for a text collection by shelling out to the 'bigartm' executable.

    The collection root directory must contain a 'vocab*.txt' vocabulary file
    and one or more 'vowpal*.txt' bag-of-words files: either a single unsplit
    file or a 'train'/'test' pair.
    """

    def __init__(self, collection_root):
        """
        :param str collection_root: path to the collection directory
        :raises VocabularyNotFoundError: if no file matches 'vocab*.txt'
        :raises InvalidSplitsError: if the vowpal files are neither a single
            unsplit file nor a 'train'/'test' pair
        """
        self._root = collection_root
        self._col_name = os.path.basename(self._root)
        try:
            self._vocab = self._glob("vocab*.txt")[0]
        except IndexError:
            raise VocabularyNotFoundError("Glob pattern '{}' did not match any file in '{}'".format("vocab*.txt", self._root))
        if self._col_name not in os.path.basename(self._vocab):
            # Fixed: the original message was missing the closing quote after '.txt'.
            logger.warning("{} Instead '{}' found.".format("Vocabulary file usually has the format 'vocab.{col_name}.txt'.", os.path.basename(self._vocab)))

        # Pair each vowpal file with its split label extracted from the file
        # name: '' for an unsplit collection, 'train'/'test' otherwise.
        # reverse=True orders 'train' before 'test'.
        self._splits = sorted([(re.search(r"vowpal\.{}-?([\w\-]*)\.txt".format(self._col_name), f).group(1), f)
                               for f in self._glob("vowpal*.txt")],
                              key=lambda x: x[0], reverse=True)
        split_labels = [label for label, _ in self._splits]
        if split_labels != [''] and split_labels != ['train', 'test']:
            # Fixed: the original message was missing the opening parenthesis.
            raise InvalidSplitsError("Either 'train' and 'test' splits must be defined or a '' split (no splitting; all dataset used for training)")

    def _glob(self, pattern):
        """Return the paths under the collection root matching *pattern*."""
        return glob("{}/{}".format(self._root, pattern))

    def _path(self, *args, **kwargs):
        """Build a path under the collection root by joining the non-empty
        *args* with '_', appending '.<extension>' if the 'extension' keyword
        argument is given and non-empty.
        """
        extension = kwargs.get('extension', '')
        stem = '_'.join(str(a) for a in args if a != '')
        if extension:
            stem += '.' + extension
        return os.path.join(self._root, stem)

    def create_files(self, cooc_window=5, min_tf=0, min_df=0, apply_zero_index=True):
        """Create cooc/ppmi files for every split of the collection.

        :param int cooc_window: number of tokens around a specific token used
            in the calculation of co-occurrences
        :param int min_tf: minimal co-occurrence count for a token pair to be
            saved in the dictionary of co-occurrences
        :param int min_df: minimal number of documents a token pair must
            co-occur in to be saved
        :param bool apply_zero_index: rewrite token ids in the produced files
            from 1-based to 0-based, in place
        """
        _file = {}
        for s, vowpal in self._splits:
            _file[s] = {'cooc_tf': self._path('cooc', min_tf, 'tf', s, extension='txt'),
                        'cooc_df': self._path('cooc', min_df, 'df', s, extension='txt'),
                        'ppmi_tf': self._path('ppmi', min_tf, 'tf', s, extension='txt'),
                        'ppmi_df': self._path('ppmi', min_df, 'df', s, extension='txt')}

            exit_code = self.create_cooc_files(vowpal, self._vocab,
                                               _file[s]['cooc_tf'], _file[s]['cooc_df'],
                                               _file[s]['ppmi_tf'], _file[s]['ppmi_df'],
                                               cooc_window=cooc_window, min_tf=min_tf, min_df=min_df)
            split_name = s if s else 'all'
            if exit_code == 0:
                print("Created ppmi files for '{}' split.".format(split_name))
            else:
                print("Something went wrong when creating ppmi files for '{}' split.".format(split_name))

            if apply_zero_index:
                print("Applying zero indexing of tokens (ids)")
                # Hoisted out of the per-line loop; matches 'id id value' records.
                record = re.compile(r'^(\d+) (\d+) (.+)$')
                for key, path in _file[s].items():
                    with in_place.InPlace(path, backup_ext='.bak') as fp:
                        for line in fp:
                            match_obj = record.search(line)
                            if match_obj:
                                first, second, rest = match_obj.groups()
                                fp.write('{} {} {}\n'.format(int(first) - 1, int(second) - 1, rest))
                            else:
                                # Fixed: the original called .groups() on None and
                                # crashed with AttributeError on any line that is
                                # not an 'id id value' record; keep such lines as-is.
                                fp.write(line)

    @staticmethod
    def create_cooc_files(vowpal_file, vocab_file, cooc_tf, cooc_df, ppmi_tf, ppmi_df, cooc_window=5, min_tf=0, min_df=0):
        """Invoke the 'bigartm' executable to compute cooc/ppmi dictionaries.

        :param str vowpal_file: path to vowpal-formated bag-of-words file
        :param str vocab_file: path to uci-formated (list of unique tokens) vocabulary file
        :param str cooc_tf: file path to save the terms frequency (tf) dictionary of co-occurrences of every specific pair of tokens: total number of times a pair of tokens appears in the dataset
        :param str cooc_df: file path to save the document-frequency (df) dictionary with the number of documents in which each pair of tokens appeared: in how many documents pair 'x' can be found
        :param str ppmi_tf: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_tf dictionary
        :param str ppmi_df: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_df dictionary
        :param int min_tf: minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences
        :param int min_df: minimal value of documents in which a specific pair of tokens occurred together closely
        :param int cooc_window: number of tokens around specific token, which are used in calculation of cooccurrences
        :return: the 'bigartm' process exit code (0 on success)
        :rtype: int
        """
        # let everything flow into the terminal
        args = ['bigartm', '-c', vowpal_file, '-v', vocab_file, '--cooc-window', str(cooc_window),
                '--cooc-min-tf', str(min_tf), '--write-cooc-tf', cooc_tf,
                '--cooc-min-df', str(min_df), '--write-cooc-df', cooc_df,
                '--write-ppmi-tf', ppmi_tf,
                '--write-ppmi-df', ppmi_df,
                '--force']
        print('Executing:', ' '.join(args))
        # List-form argv, shell=False semantics: no shell-injection risk.
        return subprocess.call(args)
class VocabularyNotFoundError(Exception):
    """Raised when no file in the collection root matches the vocabulary glob pattern."""
class InvalidSplitsError(Exception):
    """Raised when the collection's vowpal files do not form a valid split layout."""
88
import click


# CLI entry point: builds the cooc/ppmi coherence files for one collection.
# NOTE(review): this depends on the module-global `collections_root`, which is
# only bound inside the __main__ guard below; importing this module and calling
# main() directly would raise NameError — confirm that is the intended usage.
@click.command()
@click.argument('collection')
@click.option('--window', '-w', default=5, show_default=True, help="number of tokens around specific token, which are used in calculation of cooccurrences")
@click.option('--cooc_min_tf', '-min_tf', default=0, show_default=True, help="minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences")
@click.option('--cooc_min_df', '-min_df', default=0, show_default=True, help="minimal value of documents in which a specific pair of tokens occurred together closely")
def main(collection, window, cooc_min_tf, cooc_min_df):
    builder = CoherenceFilesBuilder(os.path.join(collections_root, collection))
    builder.create_files(cooc_window=window, min_tf=cooc_min_tf, min_df=cooc_min_df)
if __name__ == '__main__':
    # Hard-coded root directory that holds all collections; `main` reads this
    # module-global name when resolving the collection path.
    collections_root = "/data/thesis/data/collections"
    main()