e2edutch.conll.evaluate_conll()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 8
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 8
rs 10
c 0
b 0
f 0
cc 3
nop 3
1
import re
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
import tempfile
3
import subprocess
4
import operator
5
import collections
6
7
# Matches a "#begin document" header line. Group 1 captures the document id
# (with or without the surrounding parentheses / trailing semicolon); group 2
# captures the digits after " part " and is None when no part is given.
BEGIN_DOCUMENT_REGEX = re.compile(
    r"#begin document \(?([^\);]*)\)?;?(?: part (\d+))?")
# Extracts recall, precision and F1 (groups 1-3, percentages as text) from
# scorer output. DOTALL plus the leading/trailing ".*" lets the summary line
# sit anywhere in a multi-line output. The pattern is split over adjacent raw
# literals only to respect the line-length limit; the regex is unchanged.
COREF_RESULTS_REGEX = re.compile(
    r".*Coreference: Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%"
    r"\tPrecision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%"
    r"\tF1: ([0-9.]+)%.*",
    re.DOTALL)
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (137/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
11
12
13
def get_doc_key(doc_id, part=None):
    """
    Build the key identifying a document, or one part of a document.

    Args:
        doc_id: identifier of the document
        part: identifier of the part, or None when there is no part

    Returns:
        ``doc_id`` itself when ``part`` is None, otherwise the string
        ``'<doc_id>.p.<part>'``.
    """
    if part is None:
        return doc_id
    return '{}.p.{}'.format(doc_id, part)
18
19
20
def get_reverse_doc_key(doc_key):
    """
    Split a document key into its document id and part.

    Args:
        doc_key (str): key, optionally of the form ``'<doc_id>.p.<part>'``

    Returns:
        tuple: ``(doc_id, part)``. ``part`` is None when the key contains no
        ``'.p.'`` separator; otherwise it is the text after the last
        separator found by ``str.split``, and ``doc_id`` is everything
        before it.
    """
    segments = doc_key.split('.p.')
    if len(segments) == 1:
        # No separator present: the key is a bare document id.
        return doc_key, None
    # Earlier separators belong to the document id, so glue them back in.
    return '.p.'.join(segments[:-1]), segments[-1]
29
30
31
def get_prediction_map(predictions):
    """
    Index predicted clusters by token position, per document.

    Args:
        predictions (dict): keys are the doc_keys, values are the clusters of
                            that doc; each cluster is an iterable of
                            ``(start, end)`` token index pairs

    Returns:
        dict: maps each doc_key to a tuple ``(start_map, end_map, word_map)``
        of defaultdicts keyed by token index:

        * ``start_map[i]``: cluster ids of the mentions with
          ``start != end`` that start at ``i``, ordered by decreasing end
        * ``end_map[i]``: cluster ids of the mentions with ``start != end``
          that end at ``i``, ordered by decreasing start
        * ``word_map[i]``: cluster ids of the mentions with
          ``start == end == i``

        The cluster id is the position of the cluster in the doc's list.
    """
    other_boundary = operator.itemgetter(1)
    prediction_map = {}
    for doc_key, clusters in predictions.items():
        start_map = collections.defaultdict(list)
        end_map = collections.defaultdict(list)
        word_map = collections.defaultdict(list)
        for cluster_id, mentions in enumerate(clusters):
            for start, end in mentions:
                if start == end:
                    word_map[start].append(cluster_id)
                else:
                    start_map[start].append((cluster_id, end))
                    end_map[end].append((cluster_id, start))
        # Replace the (cluster_id, other boundary) pairs by bare cluster ids,
        # ordered by the other boundary, descending. Only existing keys are
        # reassigned, so mutating while iterating over items() is safe.
        for boundary_map in (start_map, end_map):
            for index, spans in boundary_map.items():
                ordered = sorted(spans, key=other_boundary, reverse=True)
                boundary_map[index] = [cluster_id for cluster_id, _ in ordered]
        prediction_map[doc_key] = (start_map, end_map, word_map)
    return prediction_map
52
53
54
def clusters_to_brackets(sentences, predictions):
    """
    Turn the predicted clusters of one document into bracket annotations.

    Args:
        sentences (list): the sentences of the document, each an iterable of
                          tokens
        predictions (list): the predicted clusters of the document, each an
                            iterable of ``(start, end)`` token index pairs

    Returns:
        list: one list per sentence holding one string per token: ``'-'``
        when no mention touches the token, otherwise the ``'|'``-joined
        markers, in this order: ``'<id>)'`` for mentions ending at the token,
        ``'(<id>)'`` for mentions with ``start == end`` at the token and
        ``'(<id>'`` for mentions starting there.
    """
    # get_prediction_map works per document key; use a dummy key for this doc.
    start_map, end_map, word_map = get_prediction_map({'': predictions})['']
    # The token index keeps counting across sentences; it is not reset.
    word_index = 0
    brackets_list = []
    for sent in sentences:
        sent_brackets_list = []
        for _ in sent:
            # .get() avoids inserting empty entries into the defaultdicts.
            coref_list = [
                "{})".format(cluster_id)
                for cluster_id in end_map.get(word_index, ())]
            coref_list.extend(
                "({})".format(cluster_id)
                for cluster_id in word_map.get(word_index, ()))
            coref_list.extend(
                "({}".format(cluster_id)
                for cluster_id in start_map.get(word_index, ()))
            sent_brackets_list.append(
                "|".join(coref_list) if coref_list else '-')
            word_index += 1
        brackets_list.append(sent_brackets_list)
    return brackets_list
77
78
79
def output_conll(output_file, sentences, predictions):
    """
    Output the tokens and coreferences in CONLL-2012 format

    Each document is written as a ``#begin document`` header, then one
    tab-separated line per token (doc id, index of the token within its
    sentence, token, coreference brackets) with a blank line after every
    sentence, and finally an ``#end document`` line.

    Args:
        output_file (File or IOBase): File to write the CONLL to
        sentences (dict): keys are the doc_keys, values are the sentences of
                          that doc
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc
    """
    for doc_key in sentences:
        doc_sentences = sentences[doc_key]
        brackets = clusters_to_brackets(doc_sentences, predictions[doc_key])
        doc_id, part = get_reverse_doc_key(doc_key)
        if part is None:
            output_file.write("#begin document ({});\n\n".format(doc_id))
        else:
            output_file.write(
                "#begin document ({}); part {}\n\n".format(
                    doc_id, part))
        for sent, brack_sent in zip(doc_sentences, brackets):
            for i, word in enumerate(sent):
                line = '\t'.join([doc_id, str(i), word, brack_sent[i]])
                output_file.write(line + '\n')
            output_file.write('\n')
        output_file.write('#end document\n')
106
107
108
def output_conll_align(input_file, output_file, predictions):
    """
    Copy a CONLL file, replacing its last column by the predicted brackets.

    Lines are handled by kind:

    * blank lines are written as a single newline;
    * lines whose first field starts with ``#`` are copied and followed by
      an extra newline; a ``#begin document`` line also selects the
      predictions of that document and resets the token counter;
    * every other line is a token row: its last whitespace-separated field
      is replaced by the coreference brackets of the current token (``'-'``
      when there are none) and the fields are re-joined with three spaces.

    Args:
        input_file (File or IOBase): CONLL file to read
        output_file (File or IOBase): File to write the CONLL to
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc

    Raises:
        KeyError: if a ``#begin document`` line names a document that has no
                  entry in ``predictions``
    """
    prediction_map = get_prediction_map(predictions)

    # Start with empty maps so that token rows appearing before any
    # "#begin document" line get '-' instead of raising UnboundLocalError.
    start_map, end_map, word_map = {}, {}, {}
    word_index = 0
    for line in input_file:
        row = line.split()
        if not row:
            output_file.write("\n")
        elif row[0].startswith("#"):
            begin_match = BEGIN_DOCUMENT_REGEX.match(line)
            if begin_match:
                doc_key = get_doc_key(*begin_match.groups())
                start_map, end_map, word_map = prediction_map[doc_key]
                word_index = 0
            output_file.write(line)
            output_file.write("\n")
        else:
            # .get() avoids inserting empty entries into the defaultdicts.
            coref_list = [
                "{})".format(cluster_id)
                for cluster_id in end_map.get(word_index, ())]
            coref_list.extend(
                "({})".format(cluster_id)
                for cluster_id in word_map.get(word_index, ()))
            coref_list.extend(
                "({}".format(cluster_id)
                for cluster_id in start_map.get(word_index, ()))

            row[-1] = "|".join(coref_list) if coref_list else "-"

            output_file.write("   ".join(row))
            output_file.write("\n")
            word_index += 1
144
145
146
def official_conll_eval(gold_path, predicted_path,
                        metric, official_stdout=False):
    """
    Run the scorer script on a gold and a predicted file for one metric.

    The script is invoked as ``conll-2012/scorer/v8.01/scorer.pl`` (a path
    relative to the current working directory) and its standard output is
    parsed with ``COREF_RESULTS_REGEX``.

    Args:
        gold_path (str): path of the gold CONLL file
        predicted_path (str): path of the predicted CONLL file
        metric (str): name of the metric, passed on to the scorer
        official_stdout (bool): if True, print the scorer output

    Returns:
        dict: the parsed values as floats under the keys ``'r'`` (recall),
        ``'p'`` (precision) and ``'f'`` (F1)

    Raises:
        RuntimeError: if the scorer output holds no coreference results line
    """
    cmd = ["conll-2012/scorer/v8.01/scorer.pl",
           metric, gold_path, predicted_path, "none"]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # communicate() reads stdout to EOF and waits for the process to exit,
    # so no separate wait() is needed. stderr is not piped: the scorer's
    # error output goes straight to this process's stderr and the second
    # value returned here is always None.
    stdout, _ = process.communicate()
    stdout = stdout.decode("utf-8")

    if official_stdout:
        print("Official result for {}".format(metric))
        print(stdout)

    coref_results_match = COREF_RESULTS_REGEX.match(stdout)
    if coref_results_match is None:
        # Fail with the scorer output instead of an AttributeError on None.
        raise RuntimeError(
            "Could not find coreference results for metric {!r} in scorer "
            "output (exit code {}):\n{}".format(
                metric, process.returncode, stdout))
    recall, precision, f1_score = (
        float(value) for value in coref_results_match.groups())
    return {"r": recall, "p": precision, "f": f1_score}
167
168
169
def evaluate_conll(gold_path, predictions, official_stdout=False):
    """
    Score predicted clusters against a gold CONLL file.

    The predictions are aligned with the gold file and written to a
    temporary file, whose path is printed. That file is then scored against
    the gold file for the metrics ``muc``, ``bcub`` and ``ceafe``.

    Note: the temporary file is created with ``delete=False`` and is not
    removed by this function.

    Args:
        gold_path (str): path of the gold CONLL file
        predictions (dict): keys are the doc_keys, values are the predicted
                            clusters of that doc
        official_stdout (bool): if True, print the scorer output per metric

    Returns:
        dict: maps each metric name to the result of
        ``official_conll_eval`` for that metric
    """
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as pred_file:
        with open(gold_path, "r") as gold_file:
            output_conll_align(gold_file, pred_file, predictions)
        print("Predicted conll file: {}".format(pred_file.name))
    # Score only once the with-block has closed pred_file, so that all
    # predictions are flushed to disk before the scorer reads the file.
    return {
        m: official_conll_eval(gold_path, pred_file.name, m, official_stdout)
        for m in ("muc", "bcub", "ceafe")}
0 ignored issues
show
Coding Style introduced by
Wrong continued indentation (add 4 spaces).
Loading history...
177