#!/usr/bin/env python
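"""Build token co-occurrence and positive-PMI files for a document collection
by shelling out to the 'bigartm' command-line tool. Such files are typically
used downstream to compute topic-coherence scores (hence CoherenceFilesBuilder).
"""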

import os
import re
import subprocess
import logging
from glob import glob

import click
import in_place

logger = logging.getLogger(__name__)
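
# No logging configuration is done here; the calling application can set it
# up as desired, e.g. with logging.basicConfig(level=logging.WARNING).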


class CoherenceFilesBuilder:
    def __init__(self, collection_root):
        self._root = collection_root
        self._col_name = os.path.basename(self._root)
        try:
            self._vocab = self._glob("vocab*.txt")[0]
        except IndexError:
            raise VocabularyNotFoundError("Glob pattern '{}' did not match any file in '{}'".format("vocab*.txt", self._root))
        if self._col_name not in os.path.basename(self._vocab):
            logger.warning("Vocabulary file usually has the format 'vocab.{}.txt'. Instead '{}' was found.".format(
                self._col_name, os.path.basename(self._vocab)))

        # Each vowpal file corresponds to one split; the split name ('train',
        # 'test' or '') is embedded in the file name, e.g. 'vowpal.<col_name>-train.txt'.
        self._splits = sorted([(re.search(r"vowpal\.{}-?([\w\-]*)\.txt".format(self._col_name), f).group(1), f)
                               for f in self._glob("vowpal*.txt")],
                              key=lambda x: x[0], reverse=True)
        if [s for s, _ in self._splits] not in ([''], ['train', 'test']):
            raise InvalidSplitsError("Either 'train' and 'test' splits must be defined, or a single '' split (no splitting; the whole dataset is used for training)")

    def _glob(self, pattern):
        return glob("{}/{}".format(self._root, pattern))

    def _path(self, *args, **kwargs):
        # Build a file name by joining the non-empty args with '_' and appending
        # the optional 'extension' keyword argument, e.g.
        # _path('cooc', 0, 'tf', 'train', extension='txt') -> '<root>/cooc_0_tf_train.txt'
        extension = kwargs.get('extension', '')
        name = '_'.join(str(a) for a in args if a != '')
        return os.path.join(self._root, name + ('.' + extension if extension else ''))

    def create_files(self, cooc_window=5, min_tf=0, min_df=0, apply_zero_index=True):
        _file = {}
        for s, vowpal in self._splits:
            _file[s] = {'cooc_tf': self._path('cooc', min_tf, 'tf', s, extension='txt'),
                        'cooc_df': self._path('cooc', min_df, 'df', s, extension='txt'),
                        'ppmi_tf': self._path('ppmi', min_tf, 'tf', s, extension='txt'),
                        'ppmi_df': self._path('ppmi', min_df, 'df', s, extension='txt')}

            exit_code = self.create_cooc_files(vowpal, self._vocab,
                                               _file[s]['cooc_tf'], _file[s]['cooc_df'],
                                               _file[s]['ppmi_tf'], _file[s]['ppmi_df'],
                                               cooc_window=cooc_window, min_tf=min_tf, min_df=min_df)
            if exit_code == 0:
                print("Created ppmi files for '{}' split.".format(s or 'all'))
            else:
                print("Something went wrong when creating ppmi files for '{}' split.".format(s or 'all'))

            if apply_zero_index:
                print("Applying zero indexing of tokens (ids)")
                for key, path in _file[s].items():
                    # Each line has the format '<id1> <id2> <value>'; shift both
                    # token ids down by one, in place, to make them zero-based.
                    with in_place.InPlace(path, backup_ext='.bak') as fp:
                        for line in fp:
                            ids = re.search(r'^(\d+) (\d+) (.+)$', line).groups()
                            fp.write('{} {} {}\n'.format(int(ids[0]) - 1, int(ids[1]) - 1, ids[2]))

    @staticmethod
    def create_cooc_files(vowpal_file, vocab_file, cooc_tf, cooc_df, ppmi_tf, ppmi_df, cooc_window=5, min_tf=0, min_df=0):
        """
        :param str vowpal_file: path to the vowpal-formatted bag-of-words file
        :param str vocab_file: path to the uci-formatted vocabulary file (list of unique tokens)
        :param str cooc_tf: file path to save the term-frequency (tf) dictionary of co-occurrences: the total number of times each pair of tokens appears in the dataset
        :param str cooc_df: file path to save the document-frequency (df) dictionary: the number of documents in which each pair of tokens appears together
        :param str ppmi_tf: file path to save the positive PMI (point-wise mutual information) values of the token pairs in the cooc_tf dictionary
        :param str ppmi_df: file path to save the positive PMI (point-wise mutual information) values of the token pairs in the cooc_df dictionary
        :param int cooc_window: number of tokens around a specific token that are considered when counting co-occurrences
        :param int min_tf: minimal number of co-occurrences a pair of tokens must have to be saved in the co-occurrence dictionary
        :param int min_df: minimal number of documents in which a pair of tokens must occur together closely
        :return: the exit code of the 'bigartm' subprocess (0 on success)
        :rtype: int
        """
        # Do not capture stdout/stderr; let everything flow into the terminal.
        args = ['bigartm', '-c', vowpal_file, '-v', vocab_file, '--cooc-window', str(cooc_window),
                '--cooc-min-tf', str(min_tf), '--write-cooc-tf', cooc_tf,
                '--cooc-min-df', str(min_df), '--write-cooc-df', cooc_df,
                '--write-ppmi-tf', ppmi_tf,
                '--write-ppmi-df', ppmi_df,
                '--force']
        print('Executing:', ' '.join(args))
        return subprocess.call(args)
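
# A minimal usage sketch (the 'mycol' collection directory is hypothetical; it
# is expected to contain a 'vocab.mycol.txt' vocabulary file plus either a single
# 'vowpal.mycol.txt' file or a 'vowpal.mycol-train.txt'/'vowpal.mycol-test.txt' pair):
#
#   builder = CoherenceFilesBuilder("/data/thesis/data/collections/mycol")
#   builder.create_files(cooc_window=10, min_tf=5, min_df=2)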


class VocabularyNotFoundError(Exception): pass
class InvalidSplitsError(Exception): pass


@click.command()
@click.argument('collection')
@click.option('--window', '-w', default=5, show_default=True, help="number of tokens around a specific token that are considered when counting co-occurrences")
@click.option('--cooc_min_tf', '-min_tf', default=0, show_default=True, help="minimal number of co-occurrences a pair of tokens must have to be saved in the co-occurrence dictionary")
@click.option('--cooc_min_df', '-min_df', default=0, show_default=True, help="minimal number of documents in which a pair of tokens must occur together closely")
def main(collection, window, cooc_min_tf, cooc_min_df):
    # 'collections_root' is a module-level name defined in the __main__ guard below.
    cfb = CoherenceFilesBuilder(os.path.join(collections_root, collection))
    cfb.create_files(cooc_window=window, min_tf=cooc_min_tf, min_df=cooc_min_df)


if __name__ == '__main__':
    collections_root = "/data/thesis/data/collections"
    main()
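
# Example CLI invocation (the script and collection names are illustrative;
# 'mycol' must be a directory under collections_root):
#
#   python coherence_files_builder.py mycol --window 10 -min_tf 5 -min_df 2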