e2edutch.predict.read_jsonlines() - Code Metrics - Filter-Bubble/e2e-Dutch - Measure and Improve Code Quality continuously with Scrutinizer

e2edutch.predict.read_jsonlines() A
last analyzed 2021-08-05 09:00 UTC

↳ Parent: e2edutch.predict

Complexity

Conditions

Size

Total Lines	4
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	4
dl	0
loc	4
rs	10
c	0
b	0
f	0
cc	2
nop	1

import sys

import json
import os
import io
import collections
import argparse
import logging

from e2edutch import conll
from e2edutch import minimize
from e2edutch import util
from e2edutch import coref_model as cm
from e2edutch import naf

import tensorflow.compat.v1 as tf


logger = logging.getLogger('e2edutch')


class Predictor(object):

    """
    A predictor object loads a pretrained e2e model to predict coreferences.
    It can be used to predict coreferences on tokenized text.
    """

    def __init__(self, model_name='final', config=None, verbose=False):
        if verbose:
            logger.setLevel(logging.INFO)

        if config:
            self.config = config
        else:
            # if no configuration is provided, try to get a default config.
            self.config = util.initialize_from_env(model_name=model_name)

        # Clear tensorflow context:
        tf.reset_default_graph()
        self.session = tf.compat.v1.Session()

        try:
            self.model = cm.CorefModel(self.config)
            self.model.restore(self.session)
        except ValueError:
            raise Exception("Trying to reload the model while the previous " +
                            "session hasn't been ended. Close the existing " +
                            "session with predictor.end_session()")

    def predict(self, example):
        """
        Predict coreference spans for a tokenized text.


        Args:
            example (dict): dict with the following fields:
                              sentences ([[str]])
                              doc_id (str)
                              clusters ([[(int, int)]]) (optional)

        Returns:
            [[(int, int)]]: a list of clusters. The items of the cluster are
                            spans, denoted by their start end end token index

        """
        tensorized_example = self.model.tensorize_example(
            example, is_training=False)
        feed_dict = {i: t for i, t in zip(

            self.model.input_tensors, tensorized_example)}
        _, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = self.session.run(

            self.model.predictions, feed_dict=feed_dict)
        predicted_antecedents = self.model.get_predicted_antecedents(
            top_antecedents, top_antecedent_scores)
        predicted_clusters, _ = self.model.get_predicted_clusters(
            top_span_starts, top_span_ends, predicted_antecedents)

        return predicted_clusters

    def end_session(self):
        """
        Close the session, clearing the tensorflow model context.
        """
        self.session.close()
        tf.reset_default_graph()


def get_parser():

    parser = argparse.ArgumentParser()
    parser.add_argument('config')
    parser.add_argument('input_filename')
    parser.add_argument('-o', '--output_file',
                        type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-f', '--format_out', default='conll',
                        choices=['conll', 'jsonlines', 'naf'])
    parser.add_argument('-c', '--word_col', type=int, default=2)
    parser.add_argument('--cfg_file',
                        type=str,
                        default=None,
                        help="config file")
    parser.add_argument('-v', '--verbose', action='store_true')
    return parser


def read_jsonlines(input_filename):

    for line in open(input_filename).readlines():
        example = json.loads(line)
        yield example


def main(args=None):

    parser = get_parser()
    args = parser.parse_args()
    if args.verbose:
        logger.setLevel(logging.INFO)

    # Input file in .jsonlines format or .conll.
    input_filename = args.input_filename

    ext_input = os.path.splitext(input_filename)[-1]
    if ext_input not in ['.conll', '.jsonlines', '.txt', '.naf']:
        raise Exception(
            'Input file should be .naf, .conll, .txt or .jsonlines, but is {}.'
            .format(ext_input))

    if ext_input == '.conll':
        labels = collections.defaultdict(set)
        stats = collections.defaultdict(int)
        docs = minimize.minimize_partition(
            input_filename, labels, stats, args.word_col)
    elif ext_input == '.jsonlines':
        docs = read_jsonlines(input_filename)
    elif ext_input == '.naf':
        naf_obj = naf.get_naf(input_filename)
        jsonlines_obj, term_ids, tok_ids = naf.get_jsonlines(naf_obj)

        docs = [jsonlines_obj]
    else:
        text = open(input_filename).read()
        docs = [util.create_example(text)]

    output_file = args.output_file

    config = util.initialize_from_env(cfg_file=args.cfg_file, model_cfg_file=args.config)
    predictor = Predictor(config=config)

    sentences = {}
    predictions = {}
    for example_num, example in enumerate(docs):
        example["predicted_clusters"], _ = predictor.predict(example)
        if args.format_out == 'jsonlines':
            output_file.write(json.dumps(example))
            output_file.write("\n")
        else:
            predictions[example['doc_key']] = example["predicted_clusters"]
            sentences[example['doc_key']] = example["sentences"]
        if example_num % 100 == 0:
            logger.info("Decoded {} examples.".format(example_num + 1))

    if args.format_out == 'conll':
        conll.output_conll(output_file, sentences, predictions)
    elif args.format_out == 'naf':
        # Check number of docs - what to do if multiple?
        # Create naf obj if input format was not naf
        if ext_input != '.naf':
            # To do: add linguistic processing layers for terms and tokens
            logger.warn(

                'Outputting NAF when input was not naf,'
                + 'no dependency information available')
            for doc_key in sentences:
                naf_obj, term_ids = naf.get_naf_from_sentences(
                    sentences[doc_key])
                naf_obj = naf.create_coref_layer(
                    naf_obj, predictions[doc_key], term_ids)
                naf_obj = naf.add_linguistic_processors(naf_obj)
                buffer = io.BytesIO()
                naf_obj.dump(buffer)
                output_file.write(buffer.getvalue().decode('utf-8'))
                # To do, make sepearate outputs?
                # TO do, use dependency information from conll?
        else:
            # We only have one input doc
            naf_obj = naf.create_coref_layer(
                naf_obj, example["predicted_clusters"], term_ids)

            naf_obj = naf.add_linguistic_processors(naf_obj)
            buffer = io.BytesIO()
            naf_obj.dump(buffer)
            output_file.write(buffer.getvalue().decode('utf-8'))


if __name__ == "__main__":
    main()


1			import sys
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import json
3			import os
4			import io
5			import collections
6			import argparse
7			import logging
8
9			from e2edutch import conll
10			from e2edutch import minimize
11			from e2edutch import util
12			from e2edutch import coref_model as cm
13			from e2edutch import naf
14
15			import tensorflow.compat.v1 as tf
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Unable to import 'tensorflow.compat.v1' Loading history... introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report third party import "import tensorflow.compat.v1 as tf" should be placed before "from e2edutch import conll" Loading history...
16
17			logger = logging.getLogger('e2edutch')
18
19
20			class Predictor(object):
			0 ignored issues – show introduced 2021-02-01 13:49 UTC by Report Bug Copy Issue Report Class 'Predictor' inherits from object, can be safely removed from bases in python3 Loading history...
21			"""
22			A predictor object loads a pretrained e2e model to predict coreferences.
23			It can be used to predict coreferences on tokenized text.
24			"""
25
26			def __init__(self, model_name='final', config=None, verbose=False):
27			if verbose:
28			logger.setLevel(logging.INFO)
29
30			if config:
31			self.config = config
32			else:
33			# if no configuration is provided, try to get a default config.
34			self.config = util.initialize_from_env(model_name=model_name)
35
36			# Clear tensorflow context:
37			tf.reset_default_graph()
38			self.session = tf.compat.v1.Session()
39
40			try:
41			self.model = cm.CorefModel(self.config)
42			self.model.restore(self.session)
43			except ValueError:
44			raise Exception("Trying to reload the model while the previous " +
45			"session hasn't been ended. Close the existing " +
46			"session with predictor.end_session()")
47
48			def predict(self, example):
49			"""
50			Predict coreference spans for a tokenized text.
51
52
53			Args:
54			example (dict): dict with the following fields:
55			sentences ([[str]])
56			doc_id (str)
57			clusters ([[(int, int)]]) (optional)
58
59			Returns:
60			[[(int, int)]]: a list of clusters. The items of the cluster are
61			spans, denoted by their start end end token index
62
63			"""
64			tensorized_example = self.model.tensorize_example(
65			example, is_training=False)
66			feed_dict = {i: t for i, t in zip(
			0 ignored issues – show Unused Code introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Unnecessary use of a comprehension Loading history...
67			self.model.input_tensors, tensorized_example)}
68			_, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores = self.session.run(
			0 ignored issues – show Coding Style introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (107/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
69			self.model.predictions, feed_dict=feed_dict)
70			predicted_antecedents = self.model.get_predicted_antecedents(
71			top_antecedents, top_antecedent_scores)
72			predicted_clusters, _ = self.model.get_predicted_clusters(
73			top_span_starts, top_span_ends, predicted_antecedents)
74
75			return predicted_clusters
76
77			def end_session(self):
78			"""
79			Close the session, clearing the tensorflow model context.
80			"""
81			self.session.close()
82			tf.reset_default_graph()
83
84
85			def get_parser():
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
86			parser = argparse.ArgumentParser()
87			parser.add_argument('config')
88			parser.add_argument('input_filename')
89			parser.add_argument('-o', '--output_file',
90			type=argparse.FileType('w'), default=sys.stdout)
91			parser.add_argument('-f', '--format_out', default='conll',
92			choices=['conll', 'jsonlines', 'naf'])
93			parser.add_argument('-c', '--word_col', type=int, default=2)
94			parser.add_argument('--cfg_file',
95			type=str,
96			default=None,
97			help="config file")
98			parser.add_argument('-v', '--verbose', action='store_true')
99			return parser
100
101
102			def read_jsonlines(input_filename):
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
103			for line in open(input_filename).readlines():
104			example = json.loads(line)
105			yield example
106
107
108			def main(args=None):
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history... Comprehensibility introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (22/15). Loading history...
109			parser = get_parser()
110			args = parser.parse_args()
111			if args.verbose:
112			logger.setLevel(logging.INFO)
113
114			# Input file in .jsonlines format or .conll.
115			input_filename = args.input_filename
116
117			ext_input = os.path.splitext(input_filename)[-1]
118			if ext_input not in ['.conll', '.jsonlines', '.txt', '.naf']:
119			raise Exception(
120			'Input file should be .naf, .conll, .txt or .jsonlines, but is {}.'
121			.format(ext_input))
122
123			if ext_input == '.conll':
124			labels = collections.defaultdict(set)
125			stats = collections.defaultdict(int)
126			docs = minimize.minimize_partition(
127			input_filename, labels, stats, args.word_col)
128			elif ext_input == '.jsonlines':
129			docs = read_jsonlines(input_filename)
130			elif ext_input == '.naf':
131			naf_obj = naf.get_naf(input_filename)
132			jsonlines_obj, term_ids, tok_ids = naf.get_jsonlines(naf_obj)
			0 ignored issues – show Unused Code introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report The variable `tok_ids` seems to be unused. Loading history...
133			docs = [jsonlines_obj]
134			else:
135			text = open(input_filename).read()
136			docs = [util.create_example(text)]
137
138			output_file = args.output_file
139
140			config = util.initialize_from_env(cfg_file=args.cfg_file, model_cfg_file=args.config)
141			predictor = Predictor(config=config)
142
143			sentences = {}
144			predictions = {}
145			for example_num, example in enumerate(docs):
146			example["predicted_clusters"], _ = predictor.predict(example)
147			if args.format_out == 'jsonlines':
148			output_file.write(json.dumps(example))
149			output_file.write("\n")
150			else:
151			predictions[example['doc_key']] = example["predicted_clusters"]
152			sentences[example['doc_key']] = example["sentences"]
153			if example_num % 100 == 0:
154			logger.info("Decoded {} examples.".format(example_num + 1))
			0 ignored issues – show introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
155			if args.format_out == 'conll':
156			conll.output_conll(output_file, sentences, predictions)
157			elif args.format_out == 'naf':
158			# Check number of docs - what to do if multiple?
159			# Create naf obj if input format was not naf
160			if ext_input != '.naf':
161			# To do: add linguistic processing layers for terms and tokens
162			logger.warn(
			0 ignored issues – show Coding Style Best Practice introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history... introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report Using deprecated method warn() Loading history...
163			'Outputting NAF when input was not naf,'
164			+ 'no dependency information available')
165			for doc_key in sentences:
166			naf_obj, term_ids = naf.get_naf_from_sentences(
167			sentences[doc_key])
168			naf_obj = naf.create_coref_layer(
169			naf_obj, predictions[doc_key], term_ids)
170			naf_obj = naf.add_linguistic_processors(naf_obj)
171			buffer = io.BytesIO()
172			naf_obj.dump(buffer)
173			output_file.write(buffer.getvalue().decode('utf-8'))
174			# To do, make sepearate outputs?
175			# TO do, use dependency information from conll?
176			else:
177			# We only have one input doc
178			naf_obj = naf.create_coref_layer(
179			naf_obj, example["predicted_clusters"], term_ids)
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `term_ids` does not seem to be defined for all execution paths. Loading history... Bug introduced 2020-11-23 10:55 UTC by Report Bug Copy Issue Report The loop variable `example` might not be defined here. Loading history... introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `naf_obj` does not seem to be defined for all execution paths. Loading history... introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `example` does not seem to be defined in case the `for` loop on line `145` is not entered. Are you sure this can never be the case? Loading history...
180			naf_obj = naf.add_linguistic_processors(naf_obj)
181			buffer = io.BytesIO()
182			naf_obj.dump(buffer)
183			output_file.write(buffer.getvalue().decode('utf-8'))
184
185
186			if __name__ == "__main__":
187			main()
188

Filter-Bubble / e2e-Dutch

e2edutch.predict.read_jsonlines() A last analyzed 2021-08-05 09:00 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

e2edutch.predict.read_jsonlines() A
last analyzed 2021-08-05 09:00 UTC