e2edutch.conll.evaluate_conll() - Code Metrics - Filter-Bubble/e2e-Dutch - Measure and Improve Code Quality continuously with Scrutinizer

e2edutch.conll.evaluate_conll() A
last analyzed 2021-08-05 09:00 UTC

↳ Parent: e2edutch.conll

Complexity

Conditions

Size

Total Lines	8
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	8
dl	0
loc	8
rs	10
c	0
b	0
f	0
cc	3
nop	3

import re

import tempfile
import subprocess
import operator
import collections

# Matches a "#begin document" header line. Group 1 captures the document id
# (the surrounding parentheses and trailing ';' are optional) and group 2
# captures the part number when a " part N" suffix is present.
BEGIN_DOCUMENT_REGEX = re.compile(
    r"#begin document \(?([^\);]*)\)?;?(?: part (\d+))?")
# Matches the "Coreference: Recall: ... Precision: ... F1: ..." summary in the
# scorer output. Groups 1-3 capture the recall, precision and F1 percentages.
# re.DOTALL lets the leading/trailing ".*" span the other lines of the output.
COREF_RESULTS_REGEX = re.compile(
    r".*Coreference: Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\tPrecision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\tF1: ([0-9.]+)%.*", re.DOTALL)



def get_doc_key(doc_id, part=None):
    """
    Build the key that identifies a document (or one part of a document).

    Args:
        doc_id (str): identifier of the document
        part: identifier of the part, or None if the document has no parts

    Returns:
        ``doc_id`` unchanged when ``part`` is None, otherwise the string
        ``'<doc_id>.p.<part>'``
    """
    if part is not None:
        return '{}.p.{}'.format(doc_id, part)
    return doc_id


def get_reverse_doc_key(doc_key):
    """
    Split a document key back into its document id and part.

    Args:
        doc_key (str): key of the form ``'<doc_id>'`` or
                       ``'<doc_id>.p.<part>'``

    Returns:
        tuple: ``(doc_id, part)``; ``part`` is None when the key contains no
               ``'.p.'`` separator, otherwise it is the text after the last
               separator produced by ``str.split``
    """
    pieces = doc_key.split('.p.')
    if len(pieces) == 1:
        return doc_key, None
    # Only the final piece is the part; any earlier separators belong to
    # the document id and are restored.
    *id_pieces, part = pieces
    return '.p.'.join(id_pieces), part


def get_prediction_map(predictions):
    """
    Index the predicted mentions of every document by token position.

    Args:
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc; each cluster is an iterable
                            of ``(start, end)`` mentions

    Returns:
        dict: maps each doc_key to a tuple ``(start_map, end_map, word_map)``
              of defaultdicts keyed by token position:
              ``start_map[pos]`` lists the cluster ids of multi-token
              mentions starting at ``pos``, ordered by descending end;
              ``end_map[pos]`` lists the cluster ids of multi-token mentions
              ending at ``pos``, ordered by descending start;
              ``word_map[pos]`` lists the cluster ids of mentions whose start
              equals their end.
    """
    def ids_by_descending_boundary(entries):
        # entries are (cluster_id, opposite_boundary) pairs
        ranked = sorted(entries, key=operator.itemgetter(1), reverse=True)
        return [cluster_id for cluster_id, _ in ranked]

    prediction_map = {}
    for doc_key, clusters in predictions.items():
        starts = collections.defaultdict(list)
        ends = collections.defaultdict(list)
        singles = collections.defaultdict(list)
        for cluster_id, mentions in enumerate(clusters):
            for start, end in mentions:
                if start != end:
                    starts[start].append((cluster_id, end))
                    ends[end].append((cluster_id, start))
                else:
                    singles[start].append(cluster_id)
        # Replace the (cluster_id, boundary) pairs by the ordered ids only.
        for boundary_map in (starts, ends):
            for position in list(boundary_map):
                boundary_map[position] = ids_by_descending_boundary(
                    boundary_map[position])
        prediction_map[doc_key] = (starts, ends, singles)
    return prediction_map


def clusters_to_brackets(sentences, predictions):
    """
    Convert predicted clusters to per-token CONLL bracket labels.

    Args:
        sentences (list): the sentences of one document, each an iterable of
                          tokens; token positions are counted across the
                          whole document
        predictions (list): the predicted clusters of that document

    Returns:
        list: one list per sentence holding one label per token; a label is
              '-' when the token is in no mention, otherwise the '|'-joined
              closing ("id)"), single-token ("(id)") and opening ("(id")
              brackets, in that order
    """
    start_map, end_map, word_map = get_prediction_map({'': predictions})['']
    # Order matters: closing brackets first, then single-token mentions,
    # then opening brackets.
    bracket_sources = (
        (end_map, "{})"),
        (word_map, "({})"),
        (start_map, "({"),
    )
    brackets_list = []
    word_index = 0
    for sent in sentences:
        sent_brackets = []
        for _ in sent:
            labels = []
            for lookup, template in bracket_sources:
                if word_index in lookup:
                    labels.extend(template.format(cluster_id)
                                  for cluster_id in lookup[word_index])
            sent_brackets.append("|".join(labels) if labels else '-')
            word_index += 1
        brackets_list.append(sent_brackets)
    return brackets_list


def output_conll(output_file, sentences, predictions):
    """
    Write tokens and their predicted coreferences in CONLL-2012 format.

    Every document is written as a "#begin document" header, one
    tab-separated line per token (document id, index of the token within
    its sentence, token, coreference label), a blank line after each
    sentence and a closing "#end document" line.

    Args:
        output_file (File or IOBase): File to write the CONLL to
        sentences (dict): keys are the doc_keys, values are the sentences of
                          that doc
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc
    """
    for doc_key in sentences:
        doc_sentences = sentences[doc_key]
        brackets = clusters_to_brackets(doc_sentences, predictions[doc_key])
        doc_id, part = get_reverse_doc_key(doc_key)
        if part is not None:
            header = "#begin document ({}); part {}\n\n".format(doc_id, part)
        else:
            header = "#begin document ({});\n\n".format(doc_id)
        output_file.write(header)
        for sent, brack_sent in zip(doc_sentences, brackets):
            for position, word in enumerate(sent):
                fields = (doc_id, str(position), word, brack_sent[position])
                output_file.write('\t'.join(fields) + '\n')
            output_file.write('\n')
        output_file.write('#end document\n')


def output_conll_align(input_file, output_file, predictions):
    """
    Copy a CONLL file, replacing the last column by the predicted
    coreferences.

    Blank lines are copied as a single newline. Lines whose first field
    starts with '#' are written unchanged followed by an extra newline; a
    "#begin document" line additionally selects the predictions of that
    document and resets the token counter. Every other line is treated as a
    token row: its last field is replaced by the coreference label of the
    token and the fields are written joined by three spaces.

    Token rows that appear before any "#begin document" line are labelled
    '-' (no predictions apply to them).

    Args:
        input_file (File or IOBase): CONLL file to read the token rows from
        output_file (File or IOBase): File to write the aligned CONLL to
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc

    Raises:
        KeyError: if the input contains a document whose doc_key is not in
                  ``predictions``
    """
    prediction_map = get_prediction_map(predictions)

    # Start from empty maps so that token rows preceding the first
    # "#begin document" line do not hit unbound local variables.
    start_map, end_map, word_map = {}, {}, {}
    word_index = 0
    for line in input_file:
        row = line.split()
        if not row:
            output_file.write("\n")
        elif row[0].startswith("#"):
            begin_match = BEGIN_DOCUMENT_REGEX.match(line)
            if begin_match:
                doc_key = get_doc_key(*begin_match.groups())
                start_map, end_map, word_map = prediction_map[doc_key]
                word_index = 0
            output_file.write(line)
            output_file.write("\n")
        else:
            # Closing brackets first, then single-token mentions, then
            # opening brackets.
            coref_list = []
            if word_index in end_map:
                coref_list.extend(
                    "{})".format(cluster_id)
                    for cluster_id in end_map[word_index])
            if word_index in word_map:
                coref_list.extend(
                    "({})".format(cluster_id)
                    for cluster_id in word_map[word_index])
            if word_index in start_map:
                coref_list.extend(
                    "({}".format(cluster_id)
                    for cluster_id in start_map[word_index])

            row[-1] = "|".join(coref_list) if coref_list else "-"

            output_file.write("   ".join(row))
            output_file.write("\n")
            word_index += 1


def official_conll_eval(gold_path, predicted_path,
                        metric, official_stdout=False):
    """
    Run the CONLL-2012 scorer script for one metric and parse its result.

    The scorer is invoked as ``conll-2012/scorer/v8.01/scorer.pl``, a path
    relative to the current working directory. Its standard error is not
    captured, so diagnostics go to the stderr of this process.

    Args:
        gold_path (str): path of the CONLL file with the gold annotations
        predicted_path (str): path of the CONLL file with the predictions
        metric (str): name of the metric, passed to the scorer as is
        official_stdout (bool): if True, print the full scorer output

    Returns:
        dict: recall, precision and F1 percentages as floats under the keys
              "r", "p" and "f"

    Raises:
        RuntimeError: if the scorer output contains no coreference summary
                      line (for example because the scorer failed)
    """
    cmd = ["conll-2012/scorer/v8.01/scorer.pl",
           metric, gold_path, predicted_path, "none"]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # communicate() reads all output and waits for the scorer to terminate.
    raw_stdout, _ = process.communicate()
    stdout = raw_stdout.decode("utf-8")

    if official_stdout:
        print("Official result for {}".format(metric))
        print(stdout)

    coref_results_match = COREF_RESULTS_REGEX.match(stdout)
    if coref_results_match is None:
        # Without this check a failed scorer run surfaces as an opaque
        # AttributeError on None.
        raise RuntimeError(
            "Could not find coreference results for metric {!r} in the "
            "scorer output (exit code {})".format(metric, process.returncode))
    recall = float(coref_results_match.group(1))
    precision = float(coref_results_match.group(2))
    f1 = float(coref_results_match.group(3))

    return {"r": recall, "p": precision, "f": f1}


def evaluate_conll(gold_path, predictions, official_stdout=False):
    """
    Score predictions against a gold CONLL file with the official scorer.

    The predictions are aligned with the gold file and written to a named
    temporary file, which is kept on disk (its path is printed). The scorer
    is then run once per metric.

    Args:
        gold_path (str): path of the CONLL file with the gold annotations
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc
        official_stdout (bool): if True, print the full scorer output

    Returns:
        dict: maps each of "muc", "bcub" and "ceafe" to the result of
              ``official_conll_eval`` for that metric
    """
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as pred_file:
        with open(gold_path, "r") as gold_file:
            output_conll_align(gold_file, pred_file, predictions)
        print("Predicted conll file: {}".format(pred_file.name))

    # Both files are closed here; only their paths are handed to the scorer.
    results = {}
    for metric in ("muc", "bcub", "ceafe"):
        results[metric] = official_conll_eval(
            gold_file.name, pred_file.name, metric, official_stdout)
    return results



1			import re
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import tempfile
3			import subprocess
4			import operator
5			import collections
6
7			BEGIN_DOCUMENT_REGEX = re.compile(
8			r"#begin document \(?([^\);]*)\)?;?(?: part (\d+))?")
9			COREF_RESULTS_REGEX = re.compile(
10			r".*Coreference: Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\tPrecision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\tF1: ([0-9.]+)%.*", re.DOTALL)
			0 ignored issues – show Coding Style introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (137/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
11
12
13			def get_doc_key(doc_id, part=None):
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
14			if part is None:
			0 ignored issues – show unused-code introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Unnecessary "else" after "return" Loading history...
15			return doc_id
16			else:
17			return '{}.p.{}'.format(doc_id, part)
18
19
20			def get_reverse_doc_key(doc_key):
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
21			segments = doc_key.split('.p.')
22			if len(segments) > 1:
23			part = segments[-1]
24			doc_id = '.p.'.join(segments[:-1])
25			else:
26			doc_id = doc_key
27			part = None
28			return doc_id, part
29
30
31			def get_prediction_map(predictions):
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
32			prediction_map = {}
33			for doc_key, clusters in predictions.items():
34			start_map = collections.defaultdict(list)
35			end_map = collections.defaultdict(list)
36			word_map = collections.defaultdict(list)
37			for cluster_id, mentions in enumerate(clusters):
38			for start, end in mentions:
39			if start == end:
40			word_map[start].append(cluster_id)
41			else:
42			start_map[start].append((cluster_id, end))
43			end_map[end].append((cluster_id, start))
44			for k, v in start_map.items():
			0 ignored issues – show Coding Style Naming introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Variable name "v" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
45			start_map[k] = [cluster_id for cluster_id, end in sorted(
46			v, key=operator.itemgetter(1), reverse=True)]
47			for k, v in end_map.items():
			0 ignored issues – show Coding Style Naming introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Variable name "v" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
48			end_map[k] = [cluster_id for cluster_id, start in sorted(
49			v, key=operator.itemgetter(1), reverse=True)]
50			prediction_map[doc_key] = (start_map, end_map, word_map)
51			return prediction_map
52
53
54			def clusters_to_brackets(sentences, predictions):
			0 ignored issues – show introduced 2021-01-08 14:57 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
55			prediction_map = get_prediction_map({'': predictions})
56			start_map, end_map, word_map = prediction_map['']
57			word_index = 0
58			brackets_list = []
59			for sent in sentences:
60			sent_brackets_list = []
61			for i, word in enumerate(sent):
			0 ignored issues – show Unused Code introduced 2021-01-08 14:57 UTC by Report Bug Copy Issue Report The variable `i` seems to be unused. Loading history... Unused Code introduced 2021-01-08 14:57 UTC by Report Bug Copy Issue Report The variable `word` seems to be unused. Loading history...
62			coref_list = []
63			if word_index in end_map:
64			for cluster_id in end_map[word_index]:
65			coref_list.append("{})".format(cluster_id))
66			if word_index in word_map:
67			for cluster_id in word_map[word_index]:
68			coref_list.append("({})".format(cluster_id))
69			if word_index in start_map:
70			for cluster_id in start_map[word_index]:
71			coref_list.append("({}".format(cluster_id))
72			coref = '-' if len(coref_list) == 0 else "|".join(coref_list)
73			sent_brackets_list.append(coref)
74			word_index += 1
75			brackets_list.append(sent_brackets_list)
76			return brackets_list
77
78
79			def output_conll(output_file, sentences, predictions):
80			"""
81			Output the tokens and coreferences in CONLL-2012 format
82
83			Args:
84			output_file (File or IOBase): File to write the CONLL to
85			sentences (dict): keys are the doc_keys, values are the sentences of
86			that doc
87			predictions (dict): keys are the doc_keys, values are the predicted
88			clusters of that doc
89			"""
90			for doc_key in sentences:
91			brackets = clusters_to_brackets(sentences[doc_key], predictions[doc_key])
92			doc_id, part = get_reverse_doc_key(doc_key)
93			if part is None:
94			output_file.write("#begin document ({});\n\n".format(doc_id))
95			else:
96			output_file.write(
97			"#begin document ({}); part {}\n\n".format(
98			doc_id, part))
99			for sent, brack_sent in zip(sentences[doc_key], brackets):
100			for i, word in enumerate(sent):
101			coref = brack_sent[i]
102			line = '\t'.join([doc_id, str(i), word, coref])
103			output_file.write(line + '\n')
104			output_file.write('\n')
105			output_file.write('#end document\n')
106
107
108			def output_conll_align(input_file, output_file, predictions):
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
109			prediction_map = get_prediction_map(predictions)
110
111			word_index = 0
112			for line in input_file.readlines():
113			row = line.split()
114			if len(row) == 0:
115			output_file.write("\n")
116			elif row[0].startswith("#"):
117			begin_match = re.match(BEGIN_DOCUMENT_REGEX, line)
118			if begin_match:
119			doc_key = get_doc_key(*begin_match.groups())
120			start_map, end_map, word_map = prediction_map[doc_key]
121			word_index = 0
122			output_file.write(line)
123			output_file.write("\n")
124			else:
125			coref_list = []
126			if word_index in end_map:
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `end_map` does not seem to be defined for all execution paths. Loading history...
127			for cluster_id in end_map[word_index]:
128			coref_list.append("{})".format(cluster_id))
129			if word_index in word_map:
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `word_map` does not seem to be defined for all execution paths. Loading history...
130			for cluster_id in word_map[word_index]:
131			coref_list.append("({})".format(cluster_id))
132			if word_index in start_map:
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report The variable `start_map` does not seem to be defined for all execution paths. Loading history...
133			for cluster_id in start_map[word_index]:
134			coref_list.append("({}".format(cluster_id))
135
136			if len(coref_list) == 0:
137			row[-1] = "-"
138			else:
139			row[-1] = "|".join(coref_list)
140
141			output_file.write("   ".join(row))
142			output_file.write("\n")
143			word_index += 1
144
145
146			def official_conll_eval(gold_path, predicted_path,
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
147			metric, official_stdout=False):
148			cmd = ["conll-2012/scorer/v8.01/scorer.pl",
149			metric, gold_path, predicted_path, "none"]
150			process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
151			stdout, stderr = process.communicate()
152			process.wait()
153
154			stdout = stdout.decode("utf-8")
155			if stderr is not None:
156			print(stderr)
157
158			if official_stdout:
159			print("Official result for {}".format(metric))
160			print(stdout)
161
162			coref_results_match = re.match(COREF_RESULTS_REGEX, stdout)
163			recall = float(coref_results_match.group(1))
164			precision = float(coref_results_match.group(2))
165			f1 = float(coref_results_match.group(3))
			0 ignored issues – show Coding Style Naming introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Variable name "f1" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
166			return {"r": recall, "p": precision, "f": f1}
167
168
169			def evaluate_conll(gold_path, predictions, official_stdout=False):
			0 ignored issues – show introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
170			with tempfile.NamedTemporaryFile(delete=False, mode="w") as pred_file:
171			with open(gold_path, "r") as gold_file:
172			output_conll_align(gold_file, pred_file, predictions)
173			print("Predicted conll file: {}".format(pred_file.name))
174			return {m: official_conll_eval(
175			gold_file.name, pred_file.name, m, official_stdout)
176			for m in ("muc", "bcub", "ceafe")}
			0 ignored issues – show Coding Style introduced 2020-10-27 15:20 UTC by Report Bug Copy Issue Report Wrong continued indentation (add 4 spaces). Loading history...
177

Filter-Bubble / e2e-Dutch

e2edutch.conll.evaluate_conll() A last analyzed 2021-08-05 09:00 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

e2edutch.conll.evaluate_conll() A
last analyzed 2021-08-05 09:00 UTC