CoherenceFilesBuilder.create_cooc_files()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 23
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 10
nop 9
dl 0
loc 23
rs 9.9
c 0
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent as the method evolves to need more, or different, data.

There are several approaches to avoid long parameter lists:

1
#!/usr/bin/env python
2
3
import os
4
import re
5
import in_place
6
import subprocess
7
from glob import glob
8
9
import logging
10
11
logger = logging.getLogger(__name__)
12
13
14
class CoherenceFilesBuilder:
    """Builds co-occurrence (cooc) and positive-PMI (ppmi) dictionary files
    for a text collection by shelling out to the 'bigartm' executable.

    The collection root directory must contain a 'vocab*.txt' vocabulary file
    and one or more 'vowpal*.txt' bag-of-words files: either a single unsplit
    file or a 'train'/'test' pair.
    """

    def __init__(self, collection_root):
        """
        :param str collection_root: path to the collection directory
        :raises VocabularyNotFoundError: if no file matches 'vocab*.txt'
        :raises InvalidSplitsError: if the vowpal files are neither a single
            unsplit file nor a 'train'/'test' pair
        """
        self._root = collection_root
        self._col_name = os.path.basename(self._root)
        try:
            self._vocab = self._glob("vocab*.txt")[0]
        except IndexError:
            raise VocabularyNotFoundError("Glob pattern '{}' did not match any file in '{}'".format("vocab*.txt", self._root))
        if self._col_name not in os.path.basename(self._vocab):
            # Fixed: the original message was missing the closing quote after '.txt'.
            logger.warning("{} Instead '{}' found.".format("Vocabulary file usually has the format 'vocab.{col_name}.txt'.", os.path.basename(self._vocab)))

        # Pair each vowpal file with its split label extracted from the file
        # name: '' for an unsplit collection, 'train'/'test' otherwise.
        # reverse=True orders 'train' before 'test'.
        self._splits = sorted([(re.search(r"vowpal\.{}-?([\w\-]*)\.txt".format(self._col_name), f).group(1), f)
                               for f in self._glob("vowpal*.txt")],
                              key=lambda x: x[0], reverse=True)
        split_labels = [label for label, _ in self._splits]
        if split_labels != [''] and split_labels != ['train', 'test']:
            # Fixed: the original message was missing the opening parenthesis.
            raise InvalidSplitsError("Either 'train' and 'test' splits must be defined or a '' split (no splitting; all dataset used for training)")

    def _glob(self, pattern):
        """Return the paths under the collection root matching *pattern*."""
        return glob("{}/{}".format(self._root, pattern))

    def _path(self, *args, **kwargs):
        """Build a path under the collection root by joining the non-empty
        *args* with '_', appending '.<extension>' if the 'extension' keyword
        argument is given and non-empty.
        """
        extension = kwargs.get('extension', '')
        stem = '_'.join(str(a) for a in args if a != '')
        if extension:
            stem += '.' + extension
        return os.path.join(self._root, stem)

    def create_files(self, cooc_window=5, min_tf=0, min_df=0, apply_zero_index=True):
        """Create cooc/ppmi files for every split of the collection.

        :param int cooc_window: number of tokens around a specific token used
            in the calculation of co-occurrences
        :param int min_tf: minimal co-occurrence count for a token pair to be
            saved in the dictionary of co-occurrences
        :param int min_df: minimal number of documents a token pair must
            co-occur in to be saved
        :param bool apply_zero_index: rewrite token ids in the produced files
            from 1-based to 0-based, in place
        """
        _file = {}
        for s, vowpal in self._splits:
            _file[s] = {'cooc_tf': self._path('cooc', min_tf, 'tf', s, extension='txt'),
                        'cooc_df': self._path('cooc', min_df, 'df', s, extension='txt'),
                        'ppmi_tf': self._path('ppmi', min_tf, 'tf', s, extension='txt'),
                        'ppmi_df': self._path('ppmi', min_df, 'df', s, extension='txt')}

            exit_code = self.create_cooc_files(vowpal, self._vocab,
                                               _file[s]['cooc_tf'], _file[s]['cooc_df'],
                                               _file[s]['ppmi_tf'], _file[s]['ppmi_df'],
                                               cooc_window=cooc_window, min_tf=min_tf, min_df=min_df)
            split_name = s if s else 'all'
            if exit_code == 0:
                print("Created ppmi files for '{}' split.".format(split_name))
            else:
                print("Something went wrong when creating ppmi files for '{}' split.".format(split_name))

            if apply_zero_index:
                print("Applying zero indexing of tokens (ids)")
                # Hoisted out of the per-line loop; matches 'id id value' records.
                record = re.compile(r'^(\d+) (\d+) (.+)$')
                for key, path in _file[s].items():
                    with in_place.InPlace(path, backup_ext='.bak') as fp:
                        for line in fp:
                            match_obj = record.search(line)
                            if match_obj:
                                first, second, rest = match_obj.groups()
                                fp.write('{} {} {}\n'.format(int(first) - 1, int(second) - 1, rest))
                            else:
                                # Fixed: the original called .groups() on None and
                                # crashed with AttributeError on any line that is
                                # not an 'id id value' record; keep such lines as-is.
                                fp.write(line)

    @staticmethod
    def create_cooc_files(vowpal_file, vocab_file, cooc_tf, cooc_df, ppmi_tf, ppmi_df, cooc_window=5, min_tf=0, min_df=0):
        """Invoke the 'bigartm' executable to compute cooc/ppmi dictionaries.

        :param str vowpal_file: path to vowpal-formated bag-of-words file
        :param str vocab_file: path to uci-formated (list of unique tokens) vocabulary file
        :param str cooc_tf: file path to save the terms frequency (tf) dictionary of co-occurrences of every specific pair of tokens: total number of times a pair of tokens appears in the dataset
        :param str cooc_df: file path to save the document-frequency (df) dictionary with the number of documents in which each pair of tokens appeared: in how many documents pair 'x' can be found
        :param str ppmi_tf: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_tf dictionary
        :param str ppmi_df: file path to save values of positive pmi (point-wise mutual information) of pairs of tokens from cooc_df dictionary
        :param int min_tf: minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences
        :param int min_df: minimal value of documents in which a specific pair of tokens occurred together closely
        :param int cooc_window: number of tokens around specific token, which are used in calculation of cooccurrences
        :return: the 'bigartm' process exit code (0 on success)
        :rtype: int
        """
        # let everything flow into the terminal
        args = ['bigartm', '-c', vowpal_file, '-v', vocab_file, '--cooc-window', str(cooc_window),
                '--cooc-min-tf', str(min_tf), '--write-cooc-tf', cooc_tf,
                '--cooc-min-df', str(min_df), '--write-cooc-df', cooc_df,
                '--write-ppmi-tf', ppmi_tf,
                '--write-ppmi-df', ppmi_df,
                '--force']
        print('Executing:', ' '.join(args))
        # List-form argv, shell=False semantics: no shell-injection risk.
        return subprocess.call(args)
class VocabularyNotFoundError(Exception):
    """Raised when no file in the collection root matches the vocabulary glob pattern."""
class InvalidSplitsError(Exception):
    """Raised when the collection's vowpal files do not form a valid split layout."""
88
import click


# CLI entry point: builds the cooc/ppmi coherence files for one collection.
# NOTE(review): this depends on the module-global `collections_root`, which is
# only bound inside the __main__ guard below; importing this module and calling
# main() directly would raise NameError — confirm that is the intended usage.
@click.command()
@click.argument('collection')
@click.option('--window', '-w', default=5, show_default=True, help="number of tokens around specific token, which are used in calculation of cooccurrences")
@click.option('--cooc_min_tf', '-min_tf', default=0, show_default=True, help="minimal value of cooccurrences of a pair of tokens that are saved in dictionary of cooccurrences")
@click.option('--cooc_min_df', '-min_df', default=0, show_default=True, help="minimal value of documents in which a specific pair of tokens occurred together closely")
def main(collection, window, cooc_min_tf, cooc_min_df):
    builder = CoherenceFilesBuilder(os.path.join(collections_root, collection))
    builder.create_files(cooc_window=window, min_tf=cooc_min_tf, min_df=cooc_min_df)
if __name__ == '__main__':
    # Hard-coded root directory that holds all collections; `main` reads this
    # module-global name when resolving the collection path.
    collections_root = "/data/thesis/data/collections"
    main()