Passed
Pull Request — master (#18)
by Dafne van, created 03:18

e2edutch.predict.main() — grade: D

Complexity
Conditions: 13

Size
Total Lines: 75
Code Lines: 59

Duplication
Duplicated Lines: 0
Duplication Ratio: 0 %

Importance
Changes: 0

Metric   Value
eloc     59
dl       0
loc      75
rs       4.2
c        0
b        0
f        0
cc       13
nop      1

How to fix

Long Method

Small methods make your code easier to understand, particularly when combined with a good name. And when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method, as sketched below.
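
For example, the input-reading branches of e2edutch.predict.main() (see the listing below) could be pulled out into a single helper. A minimal sketch, assuming the module's existing imports and its read_jsonlines() helper; the name read_docs is hypothetical, not part of the module:

    import collections
    import os

    from e2edutch import minimize, naf, util


    def read_docs(input_filename, word_col):
        # Hypothetical Extract Method: the extension dispatch from main(),
        # returning an iterable of example dicts.
        ext_input = os.path.splitext(input_filename)[-1]
        if ext_input == '.conll':
            labels = collections.defaultdict(set)
            stats = collections.defaultdict(int)
            return minimize.minimize_partition(
                input_filename, labels, stats, word_col)
        if ext_input == '.jsonlines':
            return read_jsonlines(input_filename)
        if ext_input == '.naf':
            naf_obj = naf.get_naf(input_filename)
            jsonlines_obj, term_ids, _ = naf.get_jsonlines(naf_obj)
            return [jsonlines_obj]
        if ext_input == '.txt':
            with open(input_filename) as text_file:
                return [util.create_example(text_file.read())]
        raise ValueError(
            'Input file should be .naf, .conll, .txt or .jsonlines, '
            'but is {}.'.format(ext_input))

main() would then shrink to a single call, docs = read_docs(args.input_filename, args.word_col), which also helps with the too-many-variables finding reported below.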

Complexity

Complex classes like e2edutch.predict.main() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring, as sketched below. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
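
Applied here, the output-related state in main() (output_file, format_out, and the sentences/predictions dicts) forms one such cohesive component. A minimal sketch, assuming the module's json and conll imports; the class name CorefWriter is hypothetical:

    import json

    from e2edutch import conll


    class CorefWriter:
        # Hypothetical Extract Class: bundles the output state that main()
        # currently threads through several branches.
        def __init__(self, output_file, format_out):
            self.output_file = output_file
            self.format_out = format_out
            self.sentences = {}
            self.predictions = {}

        def add(self, example):
            # jsonlines output is streamed; other formats are buffered.
            if self.format_out == 'jsonlines':
                self.output_file.write(json.dumps(example))
                self.output_file.write("\n")
            else:
                self.predictions[example['doc_key']] = example['predicted_clusters']
                self.sentences[example['doc_key']] = example['sentences']

        def finish(self):
            # Buffered formats are written once all documents are collected.
            if self.format_out == 'conll':
                conll.output_conll(self.output_file, self.sentences,
                                   self.predictions)

main() would then create one CorefWriter, call add() per document, and call finish() at the end.

The analysed source follows, with the inspection findings inlined as comments.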

import sys  # issue: Missing module docstring
import json
import os
import io
import collections
import argparse
import logging

from e2edutch import conll
from e2edutch import minimize
from e2edutch import util
from e2edutch import coref_model as cm
from e2edutch import naf

# issue: third party import "import tensorflow as tf" should be placed
# before "from e2edutch import conll"
import tensorflow as tf


# issues: Missing class docstring; Class 'Predictor' inherits from object,
# which can be safely removed from bases in Python 3
class Predictor(object):
    def __init__(self, model_name='best', cfg_file=None):
        self.config = util.initialize_from_env(model_name, cfg_file)
        self.session = tf.compat.v1.Session()
        self.model = cm.CorefModel(self.config)
        self.model.restore(self.session)

    def predict(self, example):
        """
        Predict coreference spans for a tokenized text.

        Args:
            example (dict): dict with the following fields:
                              sentences ([[str]])
                              doc_id (str)
                              clusters ([[(int, int)]]) (optional)

        Returns:
            [[(int, int)]]: a list of clusters. The items of the cluster are
                            spans, denoted by their start and end token index.
        """
        tensorized_example = self.model.tensorize_example(
            example, is_training=False)
        # issue (Unused Code): unnecessary use of a comprehension
        feed_dict = {i: t for i, t in zip(
            self.model.input_tensors, tensorized_example)}
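        # A minimal fix for the comprehension finding (sketch): zip already
        # yields the (tensor, value) pairs, so the dict can be built directly:
        #   feed_dict = dict(zip(self.model.input_tensors, tensorized_example))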
        # issue (Coding Style): the next line is too long per the coding
        # style (107/100)
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = self.session.run(
            self.model.predictions, feed_dict=feed_dict)
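        # One way to satisfy the length check (sketch): run the session first,
        # then unpack the result over shorter lines:
        #   outputs = self.session.run(self.model.predictions,
        #                              feed_dict=feed_dict)
        #   (_, _, _, top_span_starts, top_span_ends,
        #    top_antecedents, top_antecedent_scores) = outputs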
        predicted_antecedents = self.model.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        predicted_clusters, _ = self.model.get_predicted_clusters(
            top_span_starts, top_span_ends, predicted_antecedents)

        return predicted_clusters

    def end_session(self):  # issue: Missing function or method docstring
        self.session.close()


def get_parser():  # issue: Missing function or method docstring
    parser = argparse.ArgumentParser()
    parser.add_argument('config')
    parser.add_argument('input_filename')
    parser.add_argument('-o', '--output_file',
                        type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-f', '--format_out', default='conll',
                        choices=['conll', 'jsonlines', 'naf'])
    parser.add_argument('-c', '--word_col', type=int, default=2)
    parser.add_argument('--cfg_file',
                        type=str,
                        default=None,
                        help="config file")
    parser.add_argument('-v', '--verbose', action='store_true')
    return parser


def read_jsonlines(input_filename):  # issue: Missing function or method docstring
    for line in open(input_filename).readlines():
        example = json.loads(line)
        yield example


# issues: Missing function or method docstring; Comprehensibility: this
# function exceeds the maximum number of variables (21/15)
def main(args=None):
    parser = get_parser()
    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    # config = util.initialize_from_env(args.config, args.cfg_file)

    # Input file in .jsonlines format or .conll.
    input_filename = args.input_filename

    ext_input = os.path.splitext(input_filename)[-1]
    if ext_input not in ['.conll', '.jsonlines', '.txt', '.naf']:
        raise Exception(
            'Input file should be .naf, .conll, .txt or .jsonlines, but is {}.'
            .format(ext_input))

    if ext_input == '.conll':
        labels = collections.defaultdict(set)
        stats = collections.defaultdict(int)
        docs = minimize.minimize_partition(
            input_filename, labels, stats, args.word_col)
    elif ext_input == '.jsonlines':
        docs = read_jsonlines(input_filename)
    elif ext_input == '.naf':
        naf_obj = naf.get_naf(input_filename)
        # issue (Unused Code): the variable tok_ids seems to be unused
        jsonlines_obj, term_ids, tok_ids = naf.get_jsonlines(naf_obj)
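        # A conventional fix for the unused-variable finding (sketch): bind
        # the unused value to an underscore:
        #   jsonlines_obj, term_ids, _ = naf.get_jsonlines(naf_obj)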
        docs = [jsonlines_obj]
    else:
        text = open(input_filename).read()
        docs = [util.create_example(text)]

    output_file = args.output_file
    predictor = Predictor(args.config, args.cfg_file)
    sentences = {}
    predictions = {}
    for example_num, example in enumerate(docs):
        # logging.info(example['doc_key'])
        example["predicted_clusters"], _ = predictor.predict(example)
        if args.format_out == 'jsonlines':
            output_file.write(json.dumps(example))
            output_file.write("\n")
        else:
            predictions[example['doc_key']] = example["predicted_clusters"]
            sentences[example['doc_key']] = example["sentences"]
        if example_num % 100 == 0:
            # issue: use lazy % formatting in logging functions
            logging.info("Decoded {} examples.".format(example_num + 1))
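            # Lazy % formatting defers interpolation until the record is
            # actually emitted (sketch):
            #   logging.info("Decoded %d examples.", example_num + 1)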
    if args.format_out == 'conll':
        conll.output_conll(output_file, sentences, predictions)
    elif args.format_out == 'naf':
        # Check number of docs - what to do if multiple?
        # Create naf obj if input format was not naf
        if ext_input != '.naf':
            # To do: add linguistic processing layers for terms and tokens
            # issues: use lazy % formatting in logging functions;
            # using deprecated method warn()
            logging.warn(
                'Outputting NAF when input was not naf,'
                + 'no dependency information available')
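            # logging.warn() is a deprecated alias; the lazy, non-deprecated
            # form would be (sketch; note the concatenation above also omits
            # a space between 'naf,' and 'no'):
            #   logging.warning('Outputting NAF when input was not naf, '
            #                   'no dependency information available')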
            for doc_key in sentences:
                naf_obj, term_ids = naf.get_naf_from_sentences(
                    sentences[doc_key])
                naf_obj = naf.create_coref_layer(
                    naf_obj, predictions[doc_key], term_ids)
                naf_obj = naf.add_linguistic_processors(naf_obj)
                buffer = io.BytesIO()
                naf_obj.dump(buffer)
                output_file.write(buffer.getvalue().decode('utf-8'))
                # To do: make separate outputs?
                # To do: use dependency information from conll?
        else:
            # We only have one input doc
            # issues: the variable example does not seem to be defined in
            # case the for loop over docs above is never entered; term_ids
            # and naf_obj are not defined for all execution paths
            # Bug: the loop variable example might not be defined here
            naf_obj = naf.create_coref_layer(
                naf_obj, example["predicted_clusters"], term_ids)
            naf_obj = naf.add_linguistic_processors(naf_obj)
            buffer = io.BytesIO()
            naf_obj.dump(buffer)
            output_file.write(buffer.getvalue().decode('utf-8'))


if __name__ == "__main__":
    main()